2
python 包之 PyQuery 网页解析教程
source link: https://blog.51cto.com/autofelix/5241179
Go to the source link to read the original article, where you can view the images, any updated content, and a better-formatted layout. If the link is broken, please click the button below to view the snapshot taken at that time.
- PyQuery 是一个非常强大又灵活的网页解析库
- PyQuery 是 Python 仿照 jQuery 的严格实现
- 语法与 jQuery 几乎完全相同，更多操作可以参考 jQuery 文档
pip install pyquery
二、字符串初始化
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc)
print(type(doc))
print(doc('li'))
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc)
print(type(doc))
print(doc('li'))
三、url初始化
from pyquery import PyQuery as pq
doc = pq(url="http://www.baidu.com", encoding='utf-8')
print(doc('head'))
doc = pq(url="http://www.baidu.com", encoding='utf-8')
print(doc('head'))
四、文件初始化
from pyquery import PyQuery as pq
doc = pq(filename='index.html')
print(doc)
doc = pq(filename='index.html')
print(doc)
五、css选择器
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .fadeIn'))
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .fadeIn'))
六、查找子元素
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('#container')
lis = items.find('li')
print(type(lis))
print(lis)
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('#container')
lis = items.find('li')
print(type(lis))
print(lis)
七、兄弟元素
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#container .post-thumb')
print(div.siblings())
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#container .post-thumb')
print(div.siblings())
八、获取属性
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('#container .post-content a')
print(a)
print(a.attr('href'))
print(a.attr.href)
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('#container .post-content a')
print(a)
print(a.attr('href'))
print(a.attr.href)
九、获取文本
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('#container .post-content a').text()
print(a)
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('#container .post-content a').text()
print(a)
十、类操作
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('#container li')
print(li)
li.removeClass('fadeIn')
print(li)
li.addClass('fadeIn')
print(li)
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('#container li')
print(li)
li.removeClass('fadeIn')
print(li)
li.addClass('fadeIn')
print(li)
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK