# -*- coding: utf-8 -*-

import os
import re
import urllib.error
import urllib.parse
import urllib.request

# Root of the site; relative book links found on listing pages are resolved
# against it.
webroot = 'http://www.ixuanshu.net'

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

# Compiled once at import time instead of once per page / per book.
# Groups: author, size, update time, book link, book name, note.
_BOOK_ITEM_RE = re.compile(u'<li>.*?<div class="s">.*?target="_blank">(.*?)</a><br />大小:(.*?)<br>.*?</em><br>更新:(.*?)</div>.*?<a href="(.*?)"><img.*?>(.*?)</a>.*?<div class="u">(.*?)</div>', re.S)
# Captures the href of the second "downButton" anchor (the Txt download).
_DOWN_LINK_RE = re.compile('<a class="downButton.*?<a class="downButton" href=\'(.*?)\'.*?Txt.*?</a>', re.S)
# Pulls the text out of a name wrapped as <strong><tag ...>name</tag></strong>.
_STRONG_RE = re.compile('<strong>.*?>(.*?)<.*?</strong>', re.S)


def parse_book_list(html):
    """Return one 6-tuple of strings per book entry found in a listing page.

    Tuple order: (author, size, update time, link, name, note).
    """
    return _BOOK_ITEM_RE.findall(html)


def parse_download_links(html):
    """Return the Txt download hrefs found in a book detail page (may be empty)."""
    return _DOWN_LINK_RE.findall(html)


def strip_name_markup(book_name):
    """Return the text inside a ``<strong>``-wrapped book name.

    Raises IndexError when *book_name* does not contain such markup.
    """
    return _STRONG_RE.findall(book_name)[0]


def absolute_url(link, base=webroot):
    """Resolve *link* against *base*; already-absolute links are returned as-is."""
    return urllib.parse.urljoin(base, link)


def _fetch(url):
    """Return the raw body of *url* as bytes, or None when the request fails.

    Failures are reported on stdout (HTTP code and/or reason when available).
    """
    # urlopen only accepts ASCII URLs; percent-encode anything else while
    # leaving the URL's structural characters alone.
    safe_url = urllib.parse.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    request = urllib.request.Request(safe_url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=180) as response:
            return response.read()
    except OSError as e:
        # URLError/HTTPError derive from OSError; catching OSError also
        # covers socket timeouts raised while reading the body.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        if not (hasattr(e, 'code') or hasattr(e, 'reason')):
            print(e)
        return None


def _fetch_html(url):
    """Return the page at *url* decoded as UTF-8, or None when the request fails."""
    raw = _fetch(url)
    if raw is None:
        return None
    # Undecodable bytes are replaced so one bad byte cannot abort the crawl.
    return raw.decode('utf-8', errors='replace')


def _open_book_file(book_name):
    """Open ``<book_name>.txt`` for writing as UTF-8.

    If the name cannot be used as a file name (e.g. it still contains HTML
    markup), fall back to the text inside its ``<strong>`` wrapper.
    """
    try:
        return open(book_name + '.txt', 'w', encoding='utf-8')
    except OSError:
        return open(strip_name_markup(book_name) + '.txt', 'w', encoding='utf-8')


def _download_book(item):
    """Download the Txt file of one listing entry into the current directory.

    Raises an exception on any failure; the caller decides how to report it.
    """
    book_link, book_name = item[3], item[4]

    # Build the book's absolute URL and fetch its detail page.
    book_url = absolute_url(book_link)
    html = _fetch_html(book_url)
    if html is None:
        raise RuntimeError('request failed: ' + book_url)

    down_link = parse_download_links(html)
    print(book_name)
    print(down_link)
    if not down_link:
        raise LookupError('no Txt download link found on ' + book_url)

    # Download the txt; the link may be relative to the detail page.
    txt_url = absolute_url(down_link[0], book_url)
    raw = _fetch(txt_url)
    if raw is None:
        raise RuntimeError('request failed: ' + txt_url)
    # Decode before creating the file so a decode error leaves no empty file.
    text = raw.decode('gbk')

    print('start download')
    with _open_book_file(book_name) as fp:
        fp.write(text)
    print('down finish\n')


def _log_failure(page, book_name):
    """Append a record of a failed entry to ``error.log``."""
    with open('error.log', 'a', encoding='utf-8') as fp:
        fp.write('page:' + str(page) + '\n')
        fp.write(book_name)
        fp.write('\nThere is an error in parsing process.\n\n')


def main():
    """Crawl listing pages 20..219 and download every book found on them."""
    for page in range(20, 220):
        print('正在下载第' + str(page) + '页小说')

        url = 'http://www.ixuanshu.net/soft/sort02/index_' + str(page) + '.html'
        html = _fetch_html(url)
        if html is None:
            # Listing page unavailable: nothing to parse, move on.
            continue

        for item in parse_book_list(html):
            try:
                _download_book(item)
            except Exception as e:
                # Best effort: one bad entry must not stop the whole crawl.
                print('该条目解析出现错误,忽略')
                print(e)
                print('')
                _log_failure(page, item[4])


if __name__ == '__main__':
    main()
# -- codingutf-8 --import urllibimport urllib2import reimport oswebroot = httpwwwixuanshunetfor page in range20220 print 正在下载第+strpage+页小说 url = httpwwwixuanshunetsoftsort02index_+strpage+ht

# 原文地址: https://www.cveoy.top/t/topic/fDEA 著作权归作者所有。请勿转载和采集!

# 免费AI点我,无需注册和登录