1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author:soulchild
@file:bs4_learn.py
@time:2020/11/16
"""
from bs4 import BeautifulSoup
import requests
if __name__ == '__main__':
    # Walk-through of basic BeautifulSoup usage: build a soup from a fetched
    # page, locate tags in several ways, then read text and attribute values.

    # 1. Load a local HTML file (alternative to fetching over the network):
    # with open('./xxx.html', 'r', encoding='utf-8') as fp:
    #     soup = BeautifulSoup(fp, 'lxml')

    # 2. Load a network resource.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    }
    # requests applies no timeout by default, so an unresponsive server would
    # block this script forever; bound the wait explicitly.
    response = requests.get('https://soulchild.cn', headers=headers, timeout=10)
    html = response.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')

    # ---- Finding tags ------------------------------------------------------
    # Print the first <a> tag (soup.xxx, where xxx is an HTML tag name).
    print(soup.a)

    # Find by attribute. The 'current' anchor is looked up once and reused
    # below; the tree is never modified, so repeated lookups would return the
    # same element.
    current = soup.find('a', class_='current')
    print(current)
    print(soup.find('a', id='logo'))
    print(soup.find('a', href='https://soulchild.cn/'))

    # Find all <a> tags.
    print(soup.find_all('a'))

    # Select tags with CSS selectors.
    print('---------', soup.select('#logo'))
    print('---------', soup.select('.col-mb-12 h2'))

    # ---- Getting content ---------------------------------------------------
    print('*' * 50)
    if current is None:
        # find() returns None when nothing matches; calling .get_text() or
        # indexing on None would raise, so skip the remaining examples.
        print("no <a class='current'> tag found; skipping text/attribute examples")
    else:
        # Text contained in the tag (including text of nested tags).
        print(current.get_text())
        print(current.text)
        # .string is only set when the tag has a single child; it is None
        # when the tag contains more than one child node.
        print(current.string)

        # Attribute value of the tag.
        print(current['href'])
|