Python Script to Extract Heading and Body Text from HTML
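The following Python script uses BeautifulSoup to locate a heading (h1–h6) whose text matches a specific title in an HTML file, then prints that heading and the text of the <p> paragraphs that follow it up to the next heading: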
from bs4 import BeautifulSoup
# Script: locate a heading whose text equals `desired_title` in index.html and
# print it together with the text of the <p> elements that follow it, up to
# the next heading.

# Heading tag names. Used both to recognise the title element and to detect
# where the section below it ends.
HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")

# Read the HTML file. The encoding is given explicitly so decoding does not
# depend on the platform's locale default; change it if the file is not UTF-8.
with open("index.html", "r", encoding="utf-8") as file:
    html_data = file.read()

# Parse the HTML data using BeautifulSoup
soup = BeautifulSoup(html_data, "html.parser")

# Find the desired title: the first heading tag whose whitespace-stripped text
# is exactly equal to desired_title.
desired_title = "Specific Title"  # Replace with the desired title
title_element = soup.find(
    lambda tag: tag.name in HEADING_TAGS
    and tag.get_text(strip=True) == desired_title
)

if title_element is not None:
    # Collect the text of every <p> that is a following *sibling* of the
    # heading, stopping at the next heading of any level. Paragraphs nested
    # inside other sibling elements (e.g. a <div>) are not collected.
    body_text = []
    next_element = title_element.find_next_sibling()
    while next_element is not None and next_element.name not in HEADING_TAGS:
        if next_element.name == "p":
            body_text.append(next_element.get_text(strip=True))
        # Advance unconditionally so non-<p> siblings are skipped, not retried.
        next_element = next_element.find_next_sibling()

    # Print the found heading and body text
    print("Heading:", title_element.get_text(strip=True))
    print("Body Text:")
    for text in body_text:
        print(text)
else:
    print("Title not found.")
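Note: Replace the value of desired_title with the specific title you want to find. The script opens "index.html" relative to the current working directory, so run it from the directory that contains the file (or change the path passed to open()). The script imports BeautifulSoup from the bs4 module, so the beautifulsoup4 package must be installed (for example, pip install beautifulsoup4).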
原文地址: https://www.cveoy.top/t/topic/qc9h 著作权归作者所有。请勿转载和采集!