#encoding:utf-8
import random
import re
import socket
import sys
import time
import urllib
import urllib.parse
from io import BytesIO

try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO

try:
    import pycurl
except:
    pass

from bs4 import BeautifulSoup

score = {
    1: 28.56,
    2: 19.23,
    3: 10.20,
    4: 8.14,
    5: 7.50,
    6: 5.72,
    7: 4.01,
    8: 4.41,
    9: 5.53,
    10: 6.70,
}

def root_domain(url):
    """Return the root domain of *url*.

    Anything containing ``baidu.com`` is returned unchanged, so Baidu's own
    products stay distinguishable by sub-domain. For every other host the
    last two labels are kept (``www.example.com`` -> ``example.com``), or the
    last three when the host ends in one of the listed ``.cn`` second-level
    suffixes (``news.sina.com.cn`` -> ``sina.com.cn``).

    Args:
        url: A host name or URL string.

    Returns:
        The root domain, the unchanged input for Baidu hosts, or ``'-'`` when
        the input does not contain enough non-empty labels.
    """
    if 'baidu.com' in url:
        return url

    # Reduce the input to a bare host: drop scheme, path, port and stray dots.
    host = url.split('://', 1)[-1].split('/', 1)[0].split(':', 1)[0].strip('.')

    second_level_suffixes = ('.com.cn', '.org.cn', '.net.cn', '.gov.cn')
    wanted = 3 if host.endswith(second_level_suffixes) else 2

    labels = host.split('.')
    if len(labels) < wanted or not all(labels[-wanted:]):
        return '-'
    return '.'.join(labels[-wanted:])

def curl(url, debug=False, **kwargs):
    """Fetch *url* with pycurl and return the response body as text.

    One of three desktop browser User-Agent strings is picked at random per
    call. The request is retried indefinitely while pycurl reports a transfer
    error; any other exception (for example an unknown option name in
    *kwargs*) propagates immediately instead of being retried forever.

    Args:
        url: Address to fetch; it is also sent as the Referer header.
        debug: When true, re-raise the first ``pycurl.error`` instead of
            retrying.
        **kwargs: Extra pycurl options, keyed by the option's attribute name
            on the ``pycurl`` module (e.g. ``PROXY='host:port'``).

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced.

    Raises:
        pycurl.error: If the transfer fails and *debug* is true.
        KeyError: If a name in *kwargs* is not a ``pycurl`` attribute.
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 5.1; rv:37.0) Gecko/20100101 Firefox/37.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
    ]
    user_agent = random.choice(user_agents)

    while True:
        # pycurl hands bytes to the write callback on Python 3, so the body
        # must be collected in a bytes buffer and decoded afterwards.
        buffer = BytesIO()
        handle = pycurl.Curl()
        try:
            handle.setopt(pycurl.URL, url)
            handle.setopt(pycurl.REFERER, url)
            handle.setopt(pycurl.FOLLOWLOCATION, True)
            handle.setopt(pycurl.TIMEOUT, 60)
            handle.setopt(pycurl.ENCODING, 'gzip')
            handle.setopt(pycurl.USERAGENT, user_agent)
            handle.setopt(pycurl.NOSIGNAL, True)
            handle.setopt(pycurl.WRITEFUNCTION, buffer.write)
            for option_name, value in kwargs.items():
                handle.setopt(vars(pycurl)[option_name], value)
            handle.perform()
        except pycurl.error:
            if debug:
                raise
            continue
        finally:
            # Release the handle on success and on every failed attempt.
            handle.close()
        # assumes the page is UTF-8 encoded — TODO confirm for other sites
        return buffer.getvalue().decode('utf-8', errors='replace')

def get_baidudata(keyword, rn):
    """Return the organic result blocks of a Baidu search for *keyword*.

    If the fetched page contains the Baidu verification (captcha) URL, a
    notice is printed, the function sleeps for ten minutes and then fetches
    the page again, repeating until the verification URL is gone.

    Args:
        keyword: Search term; it is URL-quoted before being sent.
        rn: Number of results requested per page (the ``rn`` query parameter).

    Returns:
        The list of ``div`` elements found by BeautifulSoup for the organic
        result class.
    """
    search_url = 'http://www.baidu.com/s?wd=%s&rn=%d' % (
        urllib.parse.quote(keyword),
        rn,
    )
    pagetext = curl(search_url)  # raw HTML of the result page

    # Back off and retry for as long as Baidu answers with a captcha page.
    while 'http://verify.baidu.com' in pagetext:
        print(u'查询过程出现验证码,休息10分钟', keyword)
        time.sleep(600)
        pagetext = curl(search_url)

    soup = BeautifulSoup(pagetext, 'html.parser')
    # NOTE(review): the class string ends with a space; confirm it still
    # matches result blocks with the installed BeautifulSoup version.
    return soup.find_all('div', attrs={'class': 'result c-container '})

def get_rank_data(keyword, rn):
    """Collect the per-host scores for one keyword's search results.

    Each organic result contributes ``score[rank]`` to the root domain shown
    in its gray link, where the rank is read from the result's ``id``
    attribute. Results are skipped when they have no such link, no
    recognisable host text, a missing or non-numeric ``id``, or a rank that
    has no entry in the ``score`` table.

    Args:
        keyword: Search term to look up.
        rn: Number of results to request.

    Returns:
        A dict mapping root domain to the list of scores it earned for this
        keyword.
    """
    items = {}
    for result in get_baidudata(keyword, rn):
        links = result.find_all('a', attrs={'class': 'c-color-gray'})
        if not links:
            continue

        site = re.search(r'([a-zA-Z0-9.-]+)', links[0].text)
        if site is None:
            continue

        try:
            rank = int(result['id'])
        except (KeyError, ValueError):
            continue
        if rank not in score:
            # Only ranked positions listed in the score table carry a weight.
            continue

        host = root_domain(site.group(1))
        items.setdefault(host, []).append(score[rank])
    return items

def get_keywords(filename):
    """Read keywords from *filename*, one per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so no empty keyword is ever returned.

    Args:
        filename: Path of the text file to read.

    Returns:
        The keywords in file order.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as kwfile:
        stripped_lines = (line.strip() for line in kwfile)
        return [kw for kw in stripped_lines if kw]

def get_all_data(filename, rn):
    """Merge the per-host score lists of every keyword in *filename*.

    The keywords are read with ``get_keywords``; each one is printed with its
    1-based position and then looked up with ``get_rank_data``.

    Args:
        filename: Path of the keyword file.
        rn: Number of results to request per keyword.

    Returns:
        A dict mapping host to the combined list of scores across all
        keywords.
    """
    merged = {}
    for position, keyword in enumerate(get_keywords(filename), 1):
        print(position, keyword)
        for host, ranks in get_rank_data(keyword, rn).items():
            if host in merged:
                merged[host].extend(ranks)
            else:
                merged[host] = ranks
    return merged

def get_score(filename, rn):
    """Append one CSV row per host to ``score.csv``.

    A header line is written first, then for every host returned by
    ``get_all_data``: the host, the number of scores collected for it, the
    average score and the total score.

    Args:
        filename: Path of the keyword file.
        rn: Number of results to request per keyword.
    """
    data = get_all_data(filename, rn)
    # The context manager guarantees the file is flushed and closed.
    with open('score.csv', 'a+', encoding='utf-8') as fh:
        fh.write('host,kws,average_score,host_score,\n')
        for host, rank in data.items():
            # The file is opened in text mode, so the host is written as str;
            # encoding it to bytes first would emit "b'...'" into the CSV.
            if host is None:
                host = 'error page'
            kws = len(rank)  # number of scores collected for the host
            host_score = sum(rank)  # total score
            average_score = host_score / kws  # mean score
            fh.write('%s,%s,%s,%s\n' % (host, kws, average_score, host_score))

# Script entry point: ask for the keyword file and score the top 10 results.
if __name__ == '__main__':
    keyword_file = input('请输入包含关键词的文件名:')
    get_score(keyword_file, 10)

# Python 错误修复:ModuleNotFoundError: No module named 'StringIO'
#
# 原文地址: https://www.cveoy.top/t/topic/m3kA 著作权归作者所有。请勿转载和采集!
#
# 免费AI点我,无需注册和登录