Java 抓取网页导演信息:使用 Jsoup 库解析 HTML
可以使用 Jsoup 库来解析 HTML 代码,然后使用 CSS 选择器来获取导演信息。示例代码如下:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Main {
public static void main(String[] args) {
String html = '<div class="list-wrapper">
' + " <h2>导演 Director</h2>
" + "
" + "
" + '<ul class="celebrities-list __multiline">
' + "
" + "
" + "
" + "
" + "
" + " <li class="celebrity">
" + "
" + "
" + " <a href="https://movie.douban.com/celebrity/1276086/" title="郭帆 Frant Gwo" class="">
" + " <div class="avatar" style="background-image: url(https://img9.doubanio.com/view/personage/raw/public/300a2139dffdb8cd3eba50101766a4e4.jpg)">
" + " </div>
" + " </a>
" + "
" + " <div class="info">
" + " <span class="name"><a href="https://movie.douban.com/celebrity/1276086/" title="郭帆 Frant Gwo" class="name">郭帆 Frant Gwo</a></span>
" + "
" + " <span class="role" title="导演 Director">导演 Director</span>
" + "
" + " <span class="works">
" + " 代表作:
" + "
" + " <a href="https://movie.douban.com/subject/26266893/" target="_blank" title="流浪地球">流浪地球</a>
" + "
" + " <a href="https://movie.douban.com/subject/35267208/" target="_blank" title="流浪地球2">流浪地球2</a>
" + "
" + " <a href="https://movie.douban.com/subject/27110296/" target="_blank" title="无名之辈">无名之辈</a>
" + " </span>
" + " </div>
" + " </li>
" + '</ul></div>';
Document doc = Jsoup.parse(html);
Elements directors = doc.select("span.role[title='导演 Director']");
for (Element director : directors) {
System.out.println(director.parent().select("a.name").text());
}
}
}
输出结果:
郭帆 Frant Gwo
原文地址: https://www.cveoy.top/t/topic/nehI 著作权归作者所有。请勿转载和采集!