Python爬虫多线程爬搜索引擎

来源:转载

爬搜索引擎的信息要注意page和key的变化,还有正则表达式一定要正确

爬下面的URL:    http://weixin.sogou.com/weixin?type=2&query=

后面再跟page信息

一共三个线程,第一个负责把URL存到队列中去,第二个负责从队列中取出URL、读取需要的信息并储存,第三个负责监控,如果队列为空则结束程序

"""Multithreaded crawler for Sogou Weixin search results.

Thread roles:
  GetUrl     - producer: builds search-result page URLs for each page of the
               query, extracts article links, and puts them on the shared queue.
  GetConnect - consumer: pulls article URLs off the queue, scrapes each page's
               title and body, and appends them to ``1.html``.
  Conrl      - monitor: periodically reports progress and stops once the
               queue has drained.
"""

import queue
import re
import threading
import time
import urllib.error
import urllib.request

urlqueue = queue.Queue()


def GetData(url):
    """Fetch *url* and return the page as a UTF-8 string, or None on failure.

    On HTTP/URL errors the status code and reason are printed and the thread
    backs off for 10 seconds; any other error backs off for 1 second.
    """
    try:
        headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        # BUG FIX: the original called install_opener() here, replacing the
        # process-wide global opener on every single fetch; using the local
        # opener directly has the same effect without the global mutation.
        return opener.open(url).read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    # Explicit None so callers can (and must) check before parsing.
    return None


class GetUrl(threading.Thread):
    """Producer thread: extract article URLs from search-result pages."""

    def __init__(self, key, pagestart, pageend, urlqueue):
        threading.Thread.__init__(self)
        self.key = key              # search keyword (un-encoded)
        self.pagestart = pagestart  # first results page, inclusive
        self.pageend = pageend      # last results page, inclusive
        self.urlqueue = urlqueue    # shared queue of article URLs

    def run(self):
        keycode = urllib.request.quote(self.key)
        pagecode = urllib.request.quote("&page=")
        # Compile once, outside the page loop.
        listurl_re = re.compile('<div class="txt-box">.*?(http://.*?)"', re.S)
        for page in range(self.pagestart, self.pageend + 1):
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + pagecode + str(page))
            data = GetData(url)
            if data is None:
                # BUG FIX: the original ran findall() on None after a failed
                # fetch, raising TypeError.  Skip this results page instead.
                continue
            for page_url in listurl_re.findall(data):
                # Undo HTML-escaped ampersands in the extracted link.
                self.urlqueue.put(page_url.replace("amp;", ""))
        # BUG FIX: the original called urlqueue.task_done() here.  task_done()
        # must be called by the *consumer*, once per get() -- see GetConnect.


class GetConnect(threading.Thread):
    """Consumer thread: scrape each queued article and append it to 1.html."""

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        html1 = '''
<html>
    <head>
        <title>微信文章</title>
    </head>
    <body>
'''
        # Compile the scrape patterns once, not per URL.
        titlepat = re.compile('<title>(.*?)</title>', re.S)
        contentpat = re.compile('id="js_content">(.*?)id="js_sg_bar"', re.S)
        # BUG FIX: use ``with`` so the file is closed even on an error path;
        # the original left handles open when the loop raised.
        with open("1.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
        i = 1
        with open("1.html", 'ab') as fh:
            while True:
                try:
                    # BUG FIX: the original's bare get() blocked forever once
                    # the producer finished, so the closing tags below were
                    # never written.  Time out and finish instead.
                    url = self.urlqueue.get(timeout=60)
                except queue.Empty:
                    break
                try:
                    print(url)
                    data = GetData(url)
                    if data is None:
                        continue  # fetch failed; GetData already reported it
                    title = titlepat.findall(data)
                    content = contentpat.findall(data)
                    thistitle = title[0] if title else "no"
                    thiscontent = content[0] if content else "no"
                    dataall = ("<p>标题是:" + thistitle + "</p><p>内容是:"
                               + thiscontent + "</p><br>")
                    fh.write(dataall.encode('utf-8'))
                    print("第" + str(i) + "个网页处理")
                    i += 1
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
                finally:
                    # One task_done() per successful get(), per the Queue API.
                    self.urlqueue.task_done()
            html2 = '''
    </body>
</html>
'''
            fh.write(html2.encode('utf-8'))


class Conrl(threading.Thread):
    """Monitor thread: report progress and stop when the queue drains."""

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程序执行ing")
            time.sleep(60)
            if self.urlqueue.empty():
                print("执行完毕")
                # BUG FIX: the original called exit(), which only raises
                # SystemExit inside this thread; break is equivalent here
                # and does not suggest it terminates the whole process.
                break


def main():
    """Wire up and start the three crawler threads."""
    key = "IT"
    pagestart = 1
    pageend = 2
    thread1 = GetUrl(key, pagestart, pageend, urlqueue)
    thread1.start()
    thread2 = GetConnect(urlqueue)
    thread2.start()
    thread3 = Conrl(urlqueue)
    thread3.start()


# BUG FIX: the original started the threads at import time; guard so the
# module can be imported (e.g. for testing) without hitting the network.
if __name__ == "__main__":
    main()



分享给朋友:
您可能感兴趣的文章:
随机阅读: