bs4库学习笔记

学习自bs4官方文档https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

# 安装bs4库
pip install beautifulsoup4

from bs4 import BeautifulSoup

#解析网页，创建BeautifulSoup对象
#第一个参数为网页代码
#第二个参数为解析器
soup = BeautifulSoup(html_doc, 'html.parser')

BeautifulSoup处理后，对象可分为四种类型：Tag, NavigableString, BeautifulSoup, Comment

Tag类型

# 获得a标签（其余同理）
tag = soup.a

# 获得标签名字
# 若改变该名字，只有这个tag被改名，改动会体现在当前BeautifulSoup对象之后生成的所有HTML输出中
tag.name

# 获取标签属性
# 属性可以被修改
tag['class'] #获取标签属性
tag.attrs #获取所有属性键值对

# 多值属性，略

# 将所有子结点以列表的方式输出
tag.contents

# 循环所有tag子节点
tag.children

# 递归循环所有tag子孙结点
tag.descendants

# 获取tag内的字符串
# 若tag只有一个NavigableString类型子节点，则返回该子节点
# 若tag仅有一个子tag，则返回该子tag的.string
# 若有多个子节点，无法确定应返回哪个，则返回None
tag.string

# 循环输出tag中的多个字符串
# 用.stripped_strings清除空白
tag.strings
soup.stripped_strings

# 获取父节点
# 顶层结点父节点为BeautifulSoup
# BeautifulSoup的父节点为None
tag.parent

# 获取兄弟结点
tag.next_sibling
tag.previous_sibling

# 迭代输出tag的兄弟节点
for sibling in soup.a.next_siblings:
    pass

# 输出下一个解析对象
tag.next_element
tag.previous_element

# 迭代向前向后访问解析内容
for element in soup.next_elements:
    pass  # placeholder: handle each parsed element here

NavigableString类型

# 获得tag中的字符串
tag.string

# 修改NavigableString
tag.string.replace_with('')

# 转为unicode字符串（Python 2写法；Python 3中没有unicode()，应改用str(tag.string)）
unicode_string = unicode(tag.string)

BeautifulSoup类型

# find_all方法的使用

# 方法包含参数
# name查找所有名字为name的tag
# kwargs参数把传入参数当作指定名字的tag属性来搜索
# 有的参数不能搜索，可以通过attrs参数定义一个字典来搜索含特殊属性的tag
# string参数搜索文档中的字符串内容
# limit参数，限定返回数量
# recursive默认为True，返回所有子孙节点，False则返回直接子节点
find_all( name , attrs , recursive , string , limit , **kwargs)

# 搜索所有b标签
soup.find_all('b')

# 搜索所有正则表达式匹配内容
soup.find_all(re.compile('t')) #所有含't'的标签

# 搜索多个条件
soup.find_all(['a', 'b']) #所有a标签和b标签

# 搜索所有tag，不含字符串结点
soup.find_all(True)

# 自定义方法过滤标签
def has_class_but_no_id(tag):
    """Tell whether ``tag`` has a ``class`` attribute and lacks an ``id``.

    Written as a filter function to pass to ``find_all``. ``tag`` must
    expose a ``has_attr(name)`` method.
    """
    has_class = tag.has_attr('class')
    if not has_class:
        # No class attribute: the id check is skipped entirely.
        return has_class
    return not tag.has_attr('id')
soup.find_all(has_class_but_no_id)

# 自定义方法过滤一类标签属性
def not_lacie(href):
    """Accept an ``href`` value that is present and does not contain "lacie".

    A falsy ``href`` (missing attribute, empty string) is returned
    unchanged; otherwise the result is True when "lacie" does not occur
    in ``href`` and False when it does.
    """
    if not href:
        return href
    return re.search("lacie", href) is None
soup.find_all(href=not_lacie)

# 自定义方法过滤前后都有文字的标签
from bs4 import NavigableString
def surrounded_by_strings(tag):
    """Tell whether ``tag`` sits directly between two strings.

    True only when both ``tag.next_element`` and ``tag.previous_element``
    are ``NavigableString`` instances.
    """
    if not isinstance(tag.next_element, NavigableString):
        return False
    return isinstance(tag.previous_element, NavigableString)

# Print the name of every tag accepted by the surrounded_by_strings filter.
for tag in soup.find_all(surrounded_by_strings):
    # Call form of print: valid on Python 3, same output on Python 2 for one argument.
    print(tag.name)
    
# 按CSS类名搜索（class是Python关键字，因此参数写作class_）
soup.find_all('a', class_ = 'sister')

css

1
2
3

# CSS选择器

Comment类型

略

# 找到所有a标签
find_all('a')

# 将所有子节点以列表的方式输出
head.contents