pyquery
pyquery 是一个基于 lxml 的 HTML 解析库，可以用来解析和操作 HTML 文档。它既能查询也能修改 HTML 文档，这里我们主要演示查询能力。
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
bs4 节点选择器
bs4 方法选择器
bs4 CSS 选择器
# [node selector start]
from bs4 import BeautifulSoup

# Sample HTML document used as input for the selector examples.
html_content = """
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
"""

soup = BeautifulSoup(html_content, 'lxml')
# Attribute-style navigation picks the first matching tag at each step:
# first <div> -> its first <ul> -> its first <li>. `.contents` is the list
# of that tag's direct children (here a single text node).
print(soup.div.ul.li.contents)
# ['Foo']
# [node selector end]


# Method selector
from bs4 import BeautifulSoup

# `html_content` is the sample HTML string defined earlier in this file.
soup = BeautifulSoup(html_content, 'lxml')
# find_all(name="li") returns every <li> tag found in the document.
print(soup.find_all(name="li"))



# CSS selector

from bs4 import BeautifulSoup

# `html_content` is the sample HTML string defined earlier in this file.
soup = BeautifulSoup(html_content, 'lxml')
# Alternative selectors to try:
# print(soup.select("div.panel-body > ul.list"))
# print(soup.select('.panel .panel-heading'))

# Evaluate the descendant selector once and reuse the result for both the
# list dump and the per-item text output.
list_items = soup.select('ul li')
print(list_items)
for li in list_items:
    print(li.text)
