python多线程百度收录

查询收录数,收录率,未收录链接,收录的链接
需要安装pycurl模块
需要查询的URL放在url.csv里面,文件必须是utf-8格式
运行BDshoulu.py文件
在Windows的命令提示符下运行时,print输出的内容会乱码,请自行转码,不影响结果。
遇到验证码的时候会暂停5分钟,然后重新查询
线程建议不要开太多,否则会导致封IP

#coding:utf-8
import pycurl,re,StringIO
import  threading,Queue,time

class caiji:
        """Small HTTP fetcher built on pycurl."""

        def html(self,url):
                """Download ``url`` and return the raw response body as a string.

                Transfer failures reported by pycurl (``pycurl.error``) are
                retried indefinitely with a short pause between attempts, so
                this call only returns once a response has been received.
                Any other exception (for example a bad argument type, or
                ``KeyboardInterrupt``) propagates to the caller instead of
                being silently retried.

                url: address of the page to download.
                """
                while 1:
                        b=StringIO.StringIO()
                        c=pycurl.Curl()
                        try:
                                c.setopt(pycurl.URL,url) # address to fetch
                                c.setopt(pycurl.FOLLOWLOCATION,2) # follow HTTP redirects (original value kept)
                                c.setopt(pycurl.ENCODING, 'gzip')  # request gzip to speed up the download
                                c.setopt(pycurl.NOSIGNAL, True)   # required for safe use from multiple threads
                                c.setopt(pycurl.MAXREDIRS,1) # maximum number of redirects to follow
                                c.setopt(pycurl.CONNECTTIMEOUT,60) # connect timeout, seconds
                                # Timeout for the transfer, seconds.
                                # NOTE(review): this is smaller than CONNECTTIMEOUT above - confirm intent.
                                c.setopt(pycurl.TIMEOUT,30)
                                # Present ourselves as a desktop browser.
                                c.setopt(pycurl.USERAGENT,'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)')
                                c.setopt(pycurl.WRITEFUNCTION, b.write)  # collect the body in the string buffer
                                c.perform() # run the request
                                return b.getvalue()
                        except pycurl.error:
                                # Network-level failure: wait briefly so a dead
                                # connection does not turn into a busy loop, then retry.
                                time.sleep(1)
                        finally:
                                # Always release the curl handle, on success and on failure.
                                c.close()


# Result log, opened in append mode; the worker threads write one line per
# checked URL to this module-level handle.
wurl=open(r"url1.csv",'a')

# Rebind the name `caiji` from the class to a single instance of it; from
# here on `caiji.html(...)` is a call on that instance.
caiji=caiji()

class count:
        """Lock-protected tallies for the checking run.

        Attributes:
          shoulu  -- incremented by c_sl()
          wshoulu -- incremented by c_wshoulu()
          i       -- incremented by c_i()
          lock    -- guards every increment
        """

        def __init__(self):
                self.shoulu=0
                self.wshoulu=0
                self.i=0
                self.lock=threading.Lock()

        def _bump(self,field):
                """Add one to the named counter under the lock and return its new value."""
                with self.lock:
                        value=getattr(self,field)+1
                        setattr(self,field,value)
                        return value

        def c_wshoulu(self):
                """Increment ``wshoulu`` and return the updated value."""
                return self._bump('wshoulu')

        def c_sl(self):
                """Increment ``shoulu`` and return the updated value."""
                return self._bump('shoulu')

        def c_i(self):
                """Increment ``i`` and return the updated value."""
                return self._bump('i')

# Rebind the name `count` from the class to one instance of it; the rest of
# the module calls its c_* methods on this single object.
count=count()

class th(threading.Thread):
        """Worker thread that checks queued URLs against Baidu search.

        Each worker takes entries from ``qurl`` until it receives ``None``.
        For every entry it fetches the Baidu result page through the
        module-level ``caiji`` fetcher and classifies the page by the marker
        text it contains.  Results are printed, appended to the module-level
        ``wurl`` file and tallied in the module-level ``count`` object.
        """

        # One lock shared by every worker.  A lock created per instance would
        # only ever be taken by its own thread and would not stop two workers
        # from interleaving their numbering, console output and file writes.
        _output_lock=threading.Lock()

        def __init__(self,qurl):
                """qurl: queue of URL strings, terminated by one ``None`` per worker."""
                threading.Thread.__init__(self)
                self.qurl=qurl
                self.lock=self._output_lock
                self.cond=threading.Condition()  # not used by run(); kept for compatibility

        def _report(self,ddc,status):
                """Number, print and log one result line while holding the shared lock."""
                with self.lock:
                        i=count.c_i()
                        line='第%s条, %s ,%s' % (i,ddc,status)
                        print(line)
                        wurl.write(line+'\n')

        def run(self):
                """Process queue entries until the ``None`` sentinel arrives."""
                import urllib  # local import: only needed to escape the query value

                while 1:
                        ddc=self.qurl.get()
                        if ddc is None:
                                break
                        # Escape the entry so characters such as '&', '#' or '?'
                        # inside it stay part of the wd= value.
                        query='http://www.baidu.com/s?wd='+urllib.quote(ddc)
                        while 1:
                                bdhtm=caiji.html(query)

                                if '百度为您找到相关结果约' in bdhtm:
                                        self._report(ddc,'收录')
                                        count.c_sl()
                                        break

                                elif '抱歉,没有找到与' in bdhtm:
                                        self._report(ddc,'未收录')
                                        count.c_wshoulu()
                                        break

                                elif 'http://verify.baidu.com/' in bdhtm:
                                        with self.lock:
                                                print('%s %s' % (ddc,'出现验证码,等待5分钟后自动开始'))
                                        # Sleep outside the lock so other workers are
                                        # not blocked; 300 s matches the 5 minutes
                                        # announced in the message above.
                                        time.sleep(300)
                                        continue

                                else:
                                        # Unrecognised page: give up on this entry
                                        # without counting it.
                                        with self.lock:
                                                print('Error')
                                        break


# Driver: start the workers, feed them the URLs from url.csv, wait for them
# to finish, then print the summary.
qurl=Queue.Queue(0)
threadCount=6    # number of worker threads (default 6)

ths=[]
for t in range(threadCount):
        thread=th(qurl)
        thread.start()
        ths.append(thread)

# Load the URLs to check; the file is expected to be UTF-8.
with open(r'url.csv') as urlfile:
        for ddc in urlfile:
                # strip() removes "\n" or "\r\n" without cutting a character
                # off a final line that has no trailing newline.
                ddc=ddc.strip()
                if ddc:   # skip blank lines instead of searching for nothing
                        qurl.put(ddc)

# One sentinel per worker tells each thread to exit.
for tt in range(threadCount):
        qurl.put(None)

for t in ths:
        t.join()

# All workers are done: flush and close the result log.
wurl.close()

# Read the tallies directly; no worker is running any more, and this avoids
# bumping the counters just to read them.
sl=count.shoulu
total=count.i

# Guard against an empty input file or a run where nothing could be classified.
if total:
        rate=round(float(sl)/float(total)*100,2)
else:
        rate=0.0

print('\n收录率:%s %%' % str(rate))
print('收录:%s 条' % str(sl))
print('未收录:%s 条' % str(count.wshoulu))


本文:python多线程百度收录
文章地址:/zs/6.html 转载请保留链接地址

上一篇:返回列表
下一篇:ubuntu下安装python