1. Data Filtering with the BS4 Module
Example 1: Fetching Red Bull branch office information: http://www.redbull.com.cn/about/branch [the page structure is quite uniform]
# Fetch Red Bull branch office data: http://www.redbull.com.cn/about/branch
import requests
from bs4 import BeautifulSoup
import re
import pandas
import openpyxl
res = requests.get('http://www.redbull.com.cn/about/branch')
# First test whether extra request options (headers, etc.) are needed
# print(res.text)
"""
Company name
Company address
Company mailbox (the value is actually a postal code)
Company phone
<h2>紅牛杭州分公司</h2>
<p class='mapIco'>杭州市上城區慶春路29號遠洋大廈11樓A座</p>
<p class='mailIco'>310009</p>
<p class='telIco'>0571-87045279/7792</p>
"""
# Method 1: regular expressions
# title_list = re.findall('<h2>(.*?)</h2>', res.text)
# addr_list = re.findall("<p class='mapIco'>(.*?)</p>", res.text)
# email_list = re.findall("<p class='mailIco'>(.*?)</p>", res.text)
# phone_list = re.findall("<p class='telIco'>(.*?)</p>", res.text)
# print(phone_list)
# The four lists line up one-to-one
# # 1. Build a dict of column data
# data_dict = {
#     "Company name": title_list,
#     "Company address": addr_list,
#     "Company mailbox": email_list,
#     "Company phone": phone_list
# }
# df = pandas.DataFrame(data_dict)
# df.to_excel(r'company.xlsx')
# Method 2: BeautifulSoup
soup = BeautifulSoup(res.text, 'lxml')
# title_list = soup.find_all(name='h2')
# for title in title_list:
#     print(title.text)
# List comprehension version
title_list = [title.text for title in soup.find_all(name='h2')]
# print(title_list)
# addr_list = soup.find_all(name='p', class_='mapIco')
# for addr in addr_list:
#     print(addr.text)
addr_list = [addr.text for addr in soup.find_all(name='p', class_='mapIco')]
email_list = [email.text for email in soup.find_all(name='p', class_='mailIco')]
phone_list = [phone.text for phone in soup.find_all(name='p', class_='telIco')]
print(len(title_list))
for i in range(len(title_list)):  # iterate over what was actually found, not a hardcoded 40
    print("""
    "Company name": %s,
    "Company address": %s,
    "Company mailbox": %s,
    "Company phone": %s
    """ % (title_list[i], addr_list[i], email_list[i], phone_list[i]))
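The BS4 lists can be exported exactly like the commented-out pandas code in Method 1; a minimal sketch (it assumes the four lists came out the same length, which holds here because every branch block carries all four tags):
# Build a dict of columns and write it to Excel, as in Method 1
data_dict = {
    "Company name": title_list,
    "Company address": addr_list,
    "Company mailbox": email_list,
    "Company phone": phone_list
}
df = pandas.DataFrame(data_dict)
df.to_excel(r'company.xlsx')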
Example 2: Scraping Lianjia listing data (data processing)
import requests
from bs4 import BeautifulSoup
"""
1. Study the URL pattern
https://sh.lianjia.com/ershoufang/huangpu/
https://sh.lianjia.com/ershoufang/pudong/
https://<city abbreviation>.lianjia.com/<property type>/<district>/
2. Second-hand homes in Pudong, Shanghai
Try sending a request first
Approach 1: grab the li tags that wrap each listing, then dig inside
Approach 2: search directly for the target tags
"""
res = requests.get('https://sh.lianjia.com/ershoufang/pudong/')
# print(res.text)
soup = BeautifulSoup(res.text, 'lxml')
# Filter out the data we need
div_list = soup.find_all(name='div', class_='info')
title_list = [div.find(name='a').text for div in div_list if div.find(name='a')]
link_list = [div.find(name='a').get('href') for div in div_list if div.find(name='a')]
div1_list = soup.find_all(name='div', attrs={"class": 'positionInfo'})
addr_list = [div1.text for div1 in div1_list]
# addr_list = [div1.find('a').text for div1 in div1_list]
# print(addr_list)
# for address in addr_list:
#     res = address.split('-')
#     print(res)
# addr_list1 = [div1.find_all('a')[1].text for div1 in div1_list]
# print(addr_list1)
div2_list = soup.find_all(name='div', attrs={"class": "houseInfo"})
info_list = [div2.text for div2 in div2_list]
"""
'1室1廳 | 59平米 | 南 | 精裝 | 中樓層(共14層) | 2010年建 | 板樓'
layout
area
orientation
renovation
floor
year built
building type
"""
hx = [i.split('|')[0].strip() for i in info_list]   # layout
mj = [i.split('|')[1].strip() for i in info_list]   # area
cx = [i.split('|')[2].strip() for i in info_list]   # orientation
zx = [i.split('|')[3].strip() for i in info_list]   # renovation
lc = [i.split('|')[4].strip() for i in info_list]   # floor
nd = [i.split('|')[5].strip() for i in info_list]   # year built
lx = [i.split('|')[-1].strip() for i in info_list]  # building type (last field)
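# Side note: each houseInfo string is split seven times above. A single pass
# does the same job, and indexing from both ends tolerates a missing middle
# field (which is why building type already uses [-1]). A sketch:
split_infos = [[part.strip() for part in info.split('|')] for info in info_list]
hx2 = [parts[0] for parts in split_infos]   # layout, same values as hx
lx2 = [parts[-1] for parts in split_infos]  # building type, same values as lx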
div3_list = soup.find_all(name='div', attrs={"class": "followInfo"})
gz = [div3.text for div3 in div3_list]  # follow info (collected but not used below)
div4_list = soup.find_all(name='div', attrs={"class": "totalPrice"})
total_price = [div4.text for div4 in div4_list]
div5_list = soup.find_all(name='div', attrs={"class": "unitPrice"})
unit = [div5.text for div5 in div5_list]
"""效果"""
import pandas as pd
data_dict = {
    "Name": title_list,
    "Address": addr_list,
    "Layout": hx,
    "Area": mj,
    "Orientation": cx,
    "Renovation": zx,
    "Floor": lc,
    "Year built": nd,
    "Building type": lx,
    "Total price": total_price,
    "Unit price": unit
}
df = pd.DataFrame(data_dict)
df.to_excel(r'lianjia.xlsx')
# Multi-page pattern
You only need to study the URL (there is always a pattern)
Page 1: https://sh.lianjia.com/ershoufang/jingan/
Page 2: https://sh.lianjia.com/ershoufang/jingan/pg2/
Page 3: https://sh.lianjia.com/ershoufang/jingan/pg3/
...
https://sh.lianjia.com/ershoufang/jingan/pgN/
'''Page 1 can presumably also be written as
https://sh.lianjia.com/ershoufang/jingan/pg1/
'''
for i in range(1, 100):
    base_url = "https://sh.lianjia.com/ershoufang/jingan/pg%s/"
    print(base_url % i)
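Putting the pattern to work, each page can be fetched and parsed in the same loop; a minimal sketch (three pages only, with a pause between requests; the title extraction mirrors the code above):
import time
for i in range(1, 4):
    page = requests.get("https://sh.lianjia.com/ershoufang/jingan/pg%s/" % i)
    page_soup = BeautifulSoup(page.text, 'lxml')
    titles = [div.find('a').text for div in page_soup.find_all('div', class_='info') if div.find('a')]
    print('page %s: %s listings' % (i, len(titles)))
    time.sleep(2)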
Example 3: Scraping weather data: the site does not load its data all at once
"""
Sometimes a site's data is not loaded in one shot; internally it may be requested dynamically via JS
http://tianqi.2345.com/wea_history/58362.htm
Some responses are hard to read raw and need an online JSON formatter
Find the internal API endpoint via the Network tab of the browser dev tools
Hongkou:
http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=11
http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12
http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2021&date%5Bmonth%5D=1
Generalized: http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=<areaId>&areaInfo%5BareaType%5D=2&date%5Byear%5D=<year>&date%5Bmonth%5D=<month>
"""
import requests
import pandas as pd
res = requests.get("http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12")
json_dict = res.json()
data = json_dict.get('data')  # the payload is an HTML <table> fragment
# pd.read_html parses every <table> in the given HTML into DataFrames
tables = pd.read_html(data)
tables[0].to_excel(r'weather.xlsx')
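The endpoint generalizes, so several months can be fetched and concatenated; a sketch (71451 is the Hongkou areaId from the URLs above; the params dict is just the URL-decoded query string, and requests re-encodes the brackets to %5B/%5D):
frames = []
for year, month in [(2020, 11), (2020, 12), (2021, 1)]:
    resp = requests.get(
        "http://tianqi.2345.com/Pc/GetHistory",
        params={
            "areaInfo[areaId]": 71451,
            "areaInfo[areaType]": 2,
            "date[year]": year,
            "date[month]": month,
        },
    )
    frames.append(pd.read_html(resp.json().get('data'))[0])
pd.concat(frames).to_excel(r'weather_all.xlsx')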
Example 4: Scraping Autohome news: filtering out noise
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.autohome.com.cn/news/")
res.encoding = 'GBK'  # the page is GBK-encoded
soup = BeautifulSoup(res.text, 'lxml')
ul_ele = soup.find(name='ul', class_="article")
li_list = ul_ele.find_all('li')
# print(li_list)
title_list = []
link_list = []
info_list = []
time_list = []
num_list = []
for li in li_list:
    if li.find('a'):
        # There are noise items such as <li id="ad_tw_04" style="display: none;"></li>,
        # hence the if checks
        link = li.find('a')['href']
        # print('https:' + link)
        link_list.append('https:' + link)
    # The news title lives in an h3
    if li.find('h3'):
        title = li.find('h3').text
        title_list.append(title)
    if li.find('p'):
        info = li.find('p').text
        info_list.append(info)
    # if li.find('span'):
    #     tm = li.find('span').text
    #     time_list.append(tm)
    if li.select('span.fn-left'):
        tm = li.select('span.fn-left')[0].text
        time_list.append(tm)
    if li.select('span.fn-right'):
        num = li.select('span.fn-right')[0].find('em').text
        num_list.append(num)
        # The comment count is rendered dynamically (defaults to 0);
        # it has to be dug out of the JS files
        # comment = li.select('span.fn-right')[0].find_all('em')
        # print(comment)
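To persist the scraped items, the lists can be combined just as in the Lianjia example; a sketch (it assumes every li yielded all five fields so the lists stay aligned, which the if checks above do not strictly guarantee):
import pandas as pd
news = pd.DataFrame({
    "Title": title_list,
    "Link": link_list,
    "Summary": info_list,
    "Time": time_list,
    "Count": num_list,
})
news.to_excel(r'autohome_news.xlsx')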
Example 5: Scraping Douban data with openpyxl
# Scrape the Douban movie Top 250
1. First try scraping a single page
2. Then work out the multi-page pattern
https://movie.douban.com/top250
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
...
# Deduced first page
https://movie.douban.com/top250?start=0&filter=
import requests
from openpyxl import Workbook
from bs4 import BeautifulSoup
import time
wb = Workbook()
w1 = wb.create_sheet('Top250', index=0)
# Build the header row
w1['A1'] = 'No.'
w1['B1'] = 'Title'
w1['C1'] = 'Link'
w1['D1'] = 'Rating'
w1['E1'] = 'Votes'
# Pre-define a row counter (row 1 is the header)
count = 1
for i in range(0, 250, 25):
    base_url = 'https://movie.douban.com/top250?start=%s&filter='
    url = base_url % i
    res = requests.get(url,
                       # carry a request header (Douban rejects the default UA)
                       headers={
                           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
                       }
                       )
    soup = BeautifulSoup(res.text, 'lxml')
    ol = soup.find(name='ol', class_='grid_view')
    li_list = ol.find_all(name='li')
    for li in li_list:
        count += 1
        title = li.find(name='span').text
        link = li.find(name='a').get('href')
        num = li.select('.rating_num')[0].text
        comment = li.find(name='div', class_='star').find_all('span')[-1].text
        # Write one row
        w1['A%s' % count] = count - 1
        w1['B%s' % count] = title
        w1['C%s' % count] = link
        w1['D%s' % count] = num
        w1['E%s' % count] = comment
    # Pause deliberately between pages to avoid an IP ban
    time.sleep(5)
wb.save(r'movie.xlsx')
"""上述代碼還可以封裝成函數 和 啓動腳本的形式
def get_data(url):
...
if __name__ == '__main__':
for i in range(0,250,25):
base_url = 'https://movie.douban.com/top250?start=%s&filter='
url = base_url%i
get_data(url)
"""
Summary
1. First try scraping a single page, or even just a few records
2. Only after the logic runs end-to-end should you move on to multiple pages