PHP
发布时间:2019-11-12 发布网站:大佬教程 code.js-code.com
大佬教程收集整理的这篇文章主要介绍了php与python实现的线程池多线程爬虫功能示例,大佬教程大佬觉得挺不错的,现在分享给大家,也给大家做个参考。
本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:
多线程爬虫可以并发抓取内容,从而提升性能。这里我们来看 php 与 python 线程池多线程爬虫的例子,代码如下:
php例子
<?php
/**
 * Crawl job executed by a pthreads Pool worker.
 *
 * Fetches $url through the cURL handle owned by the worker ("Connect")
 * object and reports the HTTP status (or the cURL error) for the page.
 */
class Query extends Threaded
{
    protected $url;    // URL to fetch
    protected $result; // raw page body, populated by run()

    public function __construct($url)
    {
        $this->url = $url;
    }

    public function run()
    {
        // Reuse the persistent cURL connection held by this pool worker.
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page  = curl_exec($ch);
        $info  = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    // BUG FIX: the original signature was deal_data($url, $error) although the
    // call site above passes four arguments and the body reads $info — both
    // $page and $info were silently undefined. Accept all four parameters.
    function deal_data($url, $page, $info, $error)
    {
        $parts = explode(".", $url);
        $id = $parts[1]; // second dotted component, e.g. "xxx" from http://xxx.com
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    function show_msg($id, $msg)
    {
        echo $id."\t$msg\n";
    }

    public function getResult()
    {
        return $this->result;
    }
}
/**
 * Build a pool of 10 worker threads (each a "Connect" worker holding a
 * reusable cURL handle) and submit one Query job per URL to be checked.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // URLs to crawl, mapped to a display name
    $check_urls = array(
        'http://xxx.com' => "xx网",
    );
    $pool = new Pool(10, "Connect", array()); // pool of 10 worker threads
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown(); // wait for all submitted jobs to finish
}
check_urls_multi_pthreads();
python 多线程
def handle(sid):
    """Placeholder: process the crawl result for task *sid*."""
    # The actual crawler data processing would go here.
    pass
class myThread(Thread):
    """Worker thread that delegates its work to handle()."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid  # task id handed to handle() when the thread runs

    # BUG FIX: the original declared "def run():" without self, so starting
    # the thread raised TypeError when Thread's machinery invoked run(self).
    def run(self):
        handle(self.sid)
# Spawn ten worker threads (ids 1..10) and wait for all of them to finish.
# NOTE: range replaces the Python-2-only xrange — the rest of the article
# uses Python 3 constructs (print(), queue), and range works identically here.
threads = []
for sid in range(1, 11):
    worker = myThread(sid)
    threads.append(worker)
    worker.start()
for worker in threads:
    worker.join()
python 线程池爬虫:
s.get()
print(url)
sock = socket.socket()
sock.connect(('localhost',3000))
get = 'GET
{}
http/1.0\r\nHost: localhost\r\n\r\n'.format(url)
sock.send(get.encode('ascii'))
response = b''
chunk = sock.recv(4096)
while chunk:
response += chunk
chunk = sock.recv(4096)
links = self.parse_links(url,respons
E)
lock.acquire()
for link in link
s.difference(seen_urls):
self.task
s.put(link)
seen_url
s.update(links)
lock.release()
self.task
s.task_done()
def parse_links(self, fetched_url, response):
    """Extract crawlable local links from a raw HTTP response.

    Returns a set of defragmented URL paths found in href attributes of
    *response*; an empty set when the response is empty or not HTML.
    Only schemeless/http/https links whose host is localhost are kept.
    """
    if not response:
        print('error: {}'.format(fetched_url))
        return set()
    if not self._is_html(response):
        return set()
    urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''', self.body(response)))
    links = set()
    for url in urls:
        normalized = urllib.parse.urljoin(fetched_url, url)
        parts = urllib.parse.urlparse(normalized)
        if parts.scheme not in ('', 'http', 'https'):
            continue
        # BUG FIX: the original tested `host.lower() not in ('localhost')`,
        # which is a *substring* test against the string 'localhost' (so e.g.
        # host 'local' passed the filter). Use a one-element tuple instead.
        # Also use parts.hostname rather than the deprecated splitport().
        host = parts.hostname
        if host and host.lower() not in ('localhost',):
            continue
        # Only the path is kept — query strings are dropped, matching the
        # original behavior.
        defragmented, _frag = urllib.parse.urldefrag(parts.path)
        links.add(defragmented)
    return links
def body(self, response):
    """Return the decoded payload of a raw HTTP *response* (bytes)."""
    # Everything after the first blank line (CRLF CRLF) is the body.
    head_and_payload = response.split(b'\r\n\r\n', 1)
    return head_and_payload[1].decode('utf-8')
def _is_html(self,respons
E):
head,body = response.split(b'\r\n\r\n',1)
headers =
Dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
return header
s.get('Content-Type','').startswith('text/html')
class ThreadPool:
    """Pool of Fetcher workers pulling URLs from a shared task queue."""

    def __init__(self, num_threads):
        # Shared work queue; every Fetcher consumes URLs from it.
        self.tasks = Queue()
        for _worker in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Enqueue a URL to be fetched by one of the workers."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        self.tasks.join()
if __name__ == '__main__':
    started_at = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")       # seed the crawl with the site root
    pool.wait_completion()   # block until the task queue drains
    elapsed = time.time() - started_at
    print('{} URLs fetched in {:.1f} seconds'.format(len(seen_urls), elapsed))
更多对php相关内容感兴趣的读者可查看本站相关专题,如《php常见数据库操作技巧汇总》等。
希望本文所述对大家php程序设计有所帮助。
大佬总结
以上是大佬教程为你收集整理的php与python实现的线程池多线程爬虫功能示例全部内容,希望文章能够帮你解决php与python实现的线程池多线程爬虫功能示例所遇到的程序开发问题。
如果觉得大佬教程网站内容还不错,欢迎将大佬教程推荐给程序员好友。
本图文内容来源于网友网络收集整理提供,作为学习参考使用,版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ:384754419,请注明来意。