如何在PHP中实现Web抓取器?[已关闭]
2022-08-30 09:30:57
哪些内置的PHP函数对网页抓取有用?有哪些好的资源(网络或印刷品)可以帮助快速上手用PHP进行网页抓取?
哪些内置的PHP函数对网页抓取有用?有哪些好的资源(网络或印刷品)可以帮助快速上手用PHP进行网页抓取?
抓取通常包括 3 个步骤:
1. 向指定的 URL 发出 GET 或 POST 请求;
2. 接收作为响应返回的 HTML;
3. 从该 HTML 中解析出要抓取的文本。
为了完成步骤 1 和 2,下面是一个简单的 PHP 类,它使用 cURL 通过 GET 或 POST 获取网页。取回 HTML 后,只需使用正则表达式从中解析出要抓取的文本,即可完成步骤 3。
对于正则表达式,我最喜欢的教程网站如下: 正则表达式教程
我最喜欢的正则表达式工具是 RegexBuddy。我建议您试用该产品的演示版,即使您无意购买它。它是一个非常宝贵的工具,甚至可以用您选择的语言(包括 PHP)为您编写的正则表达式生成代码。
用法:
// Create the wrapper; with no argument the cookie file defaults to 'cookies.txt'.
$curl = new Curl();
// Fetch the page with a GET request; $html receives whatever curl_exec() returned.
$html = $curl->get("http://www.google.com");
// now, do your regex work against $html
PHP 类:
<?php
/**
 * Minimal cURL wrapper for fetching pages via GET or POST.
 *
 * Every request is sent with a fixed set of browser-like headers and a
 * Firefox user-agent string, follows redirects, and reads/writes cookies
 * from a single cookie file so that state carries over between requests.
 */
class Curl
{
    /** @var string Path of the file used both to load and to store cookies. */
    public $cookieJar = "";

    /**
     * Handle of the most recent request, or null before the first request.
     *
     * Declared explicitly: the methods below assign it, and relying on an
     * undeclared (dynamic) property is deprecated as of PHP 8.2.
     */
    public $curl = null;

    /**
     * @param string $cookieJarFile Path of the cookie file to use.
     */
    public function __construct($cookieJarFile = 'cookies.txt')
    {
        $this->cookieJar = $cookieJarFile;
    }

    /**
     * Applies the shared options (headers, user agent, cookies, redirect
     * handling) to the current handle. Expects $this->curl to be initialised.
     *
     * @return void
     */
    public function setup()
    {
        $header = array();
        $header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
        $header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        $header[] = "Cache-Control: max-age=0";
        $header[] = "Connection: keep-alive";
        $header[] = "Keep-Alive: 300";
        $header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
        $header[] = "Accept-Language: en-us,en;q=0.5";
        $header[] = "Pragma: "; // browsers keep this blank.

        curl_setopt($this->curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7');
        curl_setopt($this->curl, CURLOPT_HTTPHEADER, $header);
        // The same file is used for writing and reading cookies.
        curl_setopt($this->curl, CURLOPT_COOKIEJAR, $this->cookieJar);
        curl_setopt($this->curl, CURLOPT_COOKIEFILE, $this->cookieJar);
        curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        // Make curl_exec() return the body instead of printing it.
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
    }

    /**
     * Fetches a URL with a GET request.
     *
     * @param string $url URL to fetch.
     * @return string|bool Response body, or false if curl_exec() failed.
     */
    public function get($url)
    {
        $this->curl = curl_init($url);
        $this->setup();

        return $this->request();
    }

    /**
     * Returns everything captured by the first capture group of a pattern.
     *
     * @param string $reg PCRE pattern; should contain at least one capture group.
     * @param string $str Subject string to search.
     * @return array Matches of group 1; empty when the pattern has no capture
     *               group or matching failed (previously an undefined-index
     *               warning and a null return).
     */
    public function getAll($reg, $str)
    {
        preg_match_all($reg, $str, $matches);

        return isset($matches[1]) ? $matches[1] : array();
    }

    /**
     * Submits form data with a POST request.
     *
     * @param string       $url     URL to post to.
     * @param array|string $fields  Value handed to CURLOPT_POSTFIELDS.
     * @param string       $referer Value for the Referer header.
     * @return string|bool Response body, or false if curl_exec() failed.
     */
    public function postForm($url, $fields, $referer = '')
    {
        // curl_init($url) already sets the target URL, so no extra CURLOPT_URL is needed.
        $this->curl = curl_init($url);
        $this->setup();
        curl_setopt($this->curl, CURLOPT_POST, 1);
        curl_setopt($this->curl, CURLOPT_REFERER, $referer);
        curl_setopt($this->curl, CURLOPT_POSTFIELDS, $fields);

        return $this->request();
    }

    /**
     * Reads information about the most recent request.
     *
     * @param int|string $info A CURLINFO_* constant, or the string 'lasturl'
     *                         as a shortcut for CURLINFO_EFFECTIVE_URL.
     * @return mixed The curl_getinfo() result, or false when no request has
     *               been made yet.
     */
    public function getInfo($info)
    {
        if ($this->curl === null) {
            return false;
        }

        // Strict comparison: with loose ==, the integer 0 equals any
        // non-numeric string on PHP < 8 and would be mistaken for 'lasturl'.
        if ($info === 'lasturl') {
            return curl_getinfo($this->curl, CURLINFO_EFFECTIVE_URL);
        }

        return curl_getinfo($this->curl, $info);
    }

    /**
     * Executes the request configured on the current handle.
     *
     * @return string|bool Response body, or false if curl_exec() failed.
     */
    public function request()
    {
        return curl_exec($this->curl);
    }
}
?>
我推荐Goutte,一个简单的PHP Web Scraper。
创建一个 Goutte 客户端实例(它扩展自 Symfony\Component\BrowserKit\Client):
// Import the Goutte client class so it can be referred to as Client.
use Goutte\Client;
// Instantiate the client that the following snippets send requests through.
$client = new Client();
使用 request() 方法发出请求:
// Send a GET request to the given URL and keep the value returned by request() in $crawler.
$crawler = $client->request('GET', 'http://www.symfony-project.org/');
request() 方法返回一个 Crawler 对象(Symfony\Component\DomCrawler\Crawler)。
点击链接:
// Select the link identified by 'Plugins' on the current page
// (presumably matched by its visible text; confirm against the DomCrawler docs).
$link = $crawler->selectLink('Plugins')->link();
// Follow that link through the client; $crawler is replaced by the result of the click.
$crawler = $client->click($link);
提交表格:
// Obtain the form associated with the button identified by 'sign in'
// (presumably the button's label; confirm against the DomCrawler docs).
$form = $crawler->selectButton('sign in')->form();
// Submit the form with the given field values; the array keys are the form field names.
$crawler = $client->submit($form, array('signin[username]' => 'fabien', 'signin[password]' => 'xxxxxx'));
提取数据:
// Select the nodes matching the '.error_list' selector in the current page.
$nodes = $crawler->filter('.error_list');
// If any such node exists, print its text as an error message and stop the script.
if ($nodes->count())
{
die(sprintf("Authentification error: %s\n", $nodes->text()));
}
// Otherwise print the text of the '#nb_tasks' node, formatted as an integer by %d.
printf("Nb tasks: %d\n", $crawler->filter('#nb_tasks')->text());