bs4

bs4 是 Beautiful Soup 4 的简称,能更好地解析 HTML 和 XML 文档。比如我们有如下 HTML 文档。

<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>

bs4 节点选择器

bs4 方法选择器

bs4 CSS 选择器

 1# [节点选择器 start]
 2html_content="""
 3    <div class="panel">
 4        <div class="panel-heading">
 5            <h4>Hello</h4>
 6        </div>
 7        <div class="panel-body">
 8            <ul class="list" id="list-1">
 9                <li class="element">Foo</li>
10                <li class="element">Bar</li>
11                <li class="element">Jay</li>
12            </ul>
13            <ul class="list list-small" id="list-2">
14                <li class="element">Foo</li>
15                <li class="element">Bar</li>
16            </ul>
17        </div>
18    </div>
19"""
20from bs4 import BeautifulSoup
21soup = BeautifulSoup(html_content, 'lxml')
22print(soup.div.ul.li.contents)
23# ['Foo']
24# [节点选择器 end]
25
26
27# 方法选择器
28from bs4 import BeautifulSoup
29soup = BeautifulSoup(html_content, 'lxml')
30print(soup.find_all(name="li"))
31
32
33
34# css选择器
35
36from bs4 import BeautifulSoup
37soup = BeautifulSoup(html_content, 'lxml')
38# print(soup.select("div.panel-body > ul.list"))
39# print(soup.select('.panel .panel-heading'))
40print(soup.select('ul li'))
41for li in soup.select('ul li'):
42    print(li.text)
43