本文学习自 bs4 官方文档:https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
| pip install beautifulsoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
|
经 BeautifulSoup 处理后,对象可分为四种类型:Tag、NavigableString、BeautifulSoup、Comment。
Tag类型
| tag = soup.a
tag.name
tag['class'] tag.attrs
tag.contents
tag.children
tag.descendants
tag.string
tag.strings soup.stripped_strings
tag.parent
tag.next_sibling tag.previous_sibling
for sibling in soup.a.next_siblings: pass
tag.next_element tag.previous_element
for element in soup.next_elements:
|
NavigableString类型
| tag.string
tag.string.replace_with('')
unicode_string = unicode(tag.string)
|
BeautifulSoup类型
|
find_all( name , attrs , recursive , string , **kwargs)
soup.find_all('b')
soup.find_all(re.compile('t'))
soup.find_all(['a', 'b'])
soup.find_all(True)
def has_class_but_no_id(tag):
    """Return a truthy value when *tag* has a ``class`` attribute but no ``id``.

    Intended to be passed to ``find_all`` as a filter function.
    """
    has_class = tag.has_attr('class')
    return has_class and not tag.has_attr('id')


# Use the predicate itself as the search filter.
soup.find_all(has_class_but_no_id)
def not_lacie(href):
    """Return a truthy value when *href* is non-empty and does not contain "lacie".

    A falsy *href* (e.g. ``None`` or an empty string) is returned unchanged.
    """
    if not href:
        return href
    return re.search("lacie", href) is None


# Filter on the ``href`` attribute value using the predicate above.
soup.find_all(href=not_lacie)
from bs4 import NavigableString


def surrounded_by_strings(tag):
    """Return True when both neighbours of *tag* are ``NavigableString`` objects.

    The neighbours checked are ``tag.next_element`` and
    ``tag.previous_element``.
    """
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))


# Print the name of every tag accepted by the predicate.
for tag in soup.find_all(surrounded_by_strings):
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the original ``print tag.name`` statement is a SyntaxError on Python 3.
    print(tag.name)
soup.find_all('a', class_ = 'sister')
|
css
Comment类型
略
| find_all('a')
head.contents
|