北京市药品监督管理局药品经营企业信息爬取与导出
import re
from urllib.parse import urljoin

from lxml import etree
import requests
import xlsxwriter

# Column titles for the first spreadsheet row. The detail page lists the same
# fields in the same order (table row N -> column N), so every data row must
# contain exactly len(HEADINGS) cells.
HEADINGS = [
    '企业名称', '许可证号', '法定代表人', '企业负责人', '质量负责人', '注册地址', '仓库地址',
    '经营方式', '经营范围', '发证机关', '发证日期', '有效期至', 'GSP证书编号', 'GSP认证日期', 'GSP证书有效期',
]

SITE_ROOT = 'http://xxcx.yjj.beijing.gov.cn'
LIST_URL = SITE_ROOT + '/eportal/ui?pageId=723648'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run

# NOTE(review): the cookie is a captured browser session and has presumably
# expired; refresh it if the site starts returning empty result pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67',
    'Cookie': 'JSESSIONID=D971DB606FFE24772DCC609F1C8B355D; _va_ses=; _va_id=3cc84763318b0380.1689467249.1.1689475757.1689467249.'
}

# Captures the href of every detail-page link in a result-list page.
DETAIL_LINK_RE = re.compile(
    r'<td align="center"><a target="_blank" href="(.*?)">', re.S)

# Container of the detail table on a company page.
_DETAIL_TABLE = '//*[@id="c13053ea8cbd43abba9a4073492f00de"]/div[2]/table'


def write(infors_list, file_path):
    """Write the heading row followed by one spreadsheet row per record.

    Args:
        infors_list: iterable of rows; each row is a sequence of cell values
            ordered like HEADINGS. ``None`` cells are left blank.
        file_path: destination path of the ``.xlsx`` file.
    """
    # The context manager closes (and thereby saves) the workbook even when a
    # row fails to write.
    with xlsxwriter.Workbook(file_path, {'nan_inf_to_errors': True}) as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        worksheet.write_row('A1', HEADINGS)
        # enumerate() instead of list.index(): index() is O(n) per row and
        # maps duplicate records onto the same spreadsheet row.
        for row_number, inf in enumerate(infors_list, start=2):
            worksheet.write_row(f'A{row_number}', inf)


def _first_or_none(values):
    """Return the first element of *values*, or None when it is empty."""
    return values[0] if values else None


def _fetch_detail_links(page):
    """Return the detail-page hrefs listed on result page *page* (1-based)."""
    form = {
        'fKey': '',
        'sKey': '',
        'searchType': '',
        'filter_LIKE_XKZH': '',
        'filter_LIKE_TITLE': '',
        'currentPage': str(page),
        'pageSize': '15',
    }
    response = requests.post(
        url=LIST_URL, headers=HEADERS, data=form, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return DETAIL_LINK_RE.findall(response.text)


def _parse_detail(page_html):
    """Extract one record (a list aligned with HEADINGS) from a detail page.

    A field that is missing on the page becomes ``None`` instead of being
    skipped, so later values never slide into the wrong column.
    """
    tree = etree.HTML(page_html)
    if tree is None:  # nothing parseable came back
        return [None] * len(HEADINGS)

    record = []
    for n in range(1, len(HEADINGS) + 1):
        # Browsers insert <tbody> into the DOM, so an XPath copied from
        # dev-tools contains it; lxml parses the markup as served, where it
        # may be absent. Accept both layouts.
        cells = tree.xpath(
            f'{_DETAIL_TABLE}/tbody/tr[{n}]/td/text()'
            f' | {_DETAIL_TABLE}/tr[{n}]/td/text()'
        )
        record.append(_first_or_none(cells))
    return record


def main():
    """Scrape the first N result pages and export the records to ``N.xlsx``."""
    number = int(input('请输入前几页:'))
    information_list = []
    # Pages are 1-based and the prompt asks for the first N pages, so the
    # upper bound must be inclusive.
    for page in range(1, number + 1):
        for href in _fetch_detail_links(page):
            detail = requests.get(
                url=urljoin(SITE_ROOT, href),
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,
            )
            detail.raise_for_status()
            record = _parse_detail(detail.text)
            print(record[0])  # progress indicator: company name
            information_list.append(record)
    write(information_list, f'{number}.xlsx')


if __name__ == '__main__':
    main()

# Question: why does the jyfw value (business scope, table row 9) end up as
# an empty cell in the Excel sheet?
# Answer: possibly because the XPath expression for jyfw is wrong, so the
# information cannot be extracted correctly. Check whether the jyfw XPath
# expression is correct, or print the value of jyfw to debug and see whether
# the information is actually retrieved. Also inspect the page source to
# confirm whether the position of jyfw has changed.
原文地址: https://www.cveoy.top/t/topic/pPlb 著作权归作者所有。请勿转载和采集!