1. Data filtering with the BS4 module

Example 1: Fetching Red Bull branch office information: http://www.redbull.com.cn/about/branch [the page structure is fairly uniform]

# Fetch Red Bull branch office data from http://www.redbull.com.cn/about/branch
import requests
from bs4 import BeautifulSoup
import re
import pandas
import openpyxl

res = requests.get('http://www.redbull.com.cn/about/branch')
# First test whether any extra request options (e.g. headers) are needed
# print(res.text)
"""
公司名稱
公司地址
公司郵箱
公司電話
<h2>紅牛杭州分公司</h2>
<p class='mapIco'>杭州市上城區慶春路29號遠洋大廈11樓A座</p>
<p class='mailIco'>310009</p>
<p class='telIco'>0571-87045279/7792</p>
"""
# Method 1: regular expressions
# title_list = re.findall('<h2>(.*?)</h2>', res.text)
# addr_list = re.findall("<p class='mapIco'>(.*?)</p>", res.text)
# email_list = re.findall("<p class='mailIco'>(.*?)</p>", res.text)
# phone_list = re.findall("<p class='telIco'>(.*?)</p>", res.text)
# print(phone_list)
# The four lists line up one-to-one by position
# 1. Build a dict from the data
# data_dict = {
#     "Company Name": title_list,
#     "Company Address": addr_list,
#     "Company Email": email_list,
#     "Company Phone": phone_list
# }
# df = pandas.DataFrame(data_dict)
# df.to_excel(r'company.xlsx')

# Method 2: BeautifulSoup
soup = BeautifulSoup(res.text, 'lxml')
# title_list = soup.find_all(name='h2')
# for title in title_list:
#     print(title.text)
# List comprehension version
title_list = [title.text for title in soup.find_all(name='h2')]
# print(title_list)

# addr_list = soup.find_all(name='p',class_='mapIco')
# for addr in addr_list:
#     print(addr.text)
addr_list = [addr.text for addr in soup.find_all(name='p', class_='mapIco')]
email_list = [email.text for email in soup.find_all(name='p', class_='mailIco')]
phone_list = [phone.text for phone in soup.find_all(name='p', class_='telIco')]

print(len(title_list))
for i in range(len(title_list)):  # iterate by length instead of hard-coding 40
    print("""
        "Company Name": %s,
        "Company Address": %s,
        "Company Email": %s,
        "Company Phone": %s
    """ % (title_list[i], addr_list[i], email_list[i], phone_list[i]))
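
Since the four lists are positionally aligned, zip() makes the pairing explicit. A minimal sketch (assuming the lists built above) that writes the same table to Excel the way Method 1 did:

# Pair the parallel lists with zip() and export to Excel
import pandas as pd

rows = list(zip(title_list, addr_list, email_list, phone_list))
df = pd.DataFrame(rows, columns=['Company Name', 'Company Address',
                                 'Company Email', 'Company Phone'])
df.to_excel(r'company.xlsx', index=False)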

Example 2: Scraping Lianjia listings (data processing)

import requests
from bs4 import BeautifulSoup

"""
1.研究url規律
    https://sh.lianjia.com/ershoufang/huangpu/
     https://sh.lianjia.com/ershoufang/pudong/
     https://城市首字母縮寫.lianjia.com/房屋類型/區域名稱/
2.上海浦東區二手房
    嘗試着發送請求
       第一種:先拿存儲房屋數據的li標籤
    第二種:直接查找對應的標籤數據
"""
res = requests.get('https://sh.lianjia.com/ershoufang/pudong/')
# print(res.text)

soup = BeautifulSoup(res.text, 'lxml')
# Filter the listing data out of the parsed page
div_list = soup.find_all(name='div', class_='info')

title_list = [div.find(name='a').text for div in div_list if div.find(name='a')]

link_list = [div.find(name='a').get('href') for div in div_list if div.find(name='a')]

div1_list = soup.find_all(name='div', attrs={"class": 'positionInfo'})
addr_list = [div1.text for div1 in div1_list]
# addr_list = [div1.find('a').text for div1 in div1_list]
# print(addr_list)
# for address in addr_list:
#     res = address.split('-')
#     print(res)
# addr_list1 = [div1.find_all('a')[1].text for div1 in div1_list]
# print(addr_list1)
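The positionInfo text bundles the community name and the area, separated by '-' (as the commented split above suggests). A minimal sketch, assuming that two-part 'community - area' format, that pulls them into separate columns:

# Split the 'community - area' text from positionInfo into two lists
community_list = [addr.split('-')[0].strip() for addr in addr_list]
area_list = [addr.split('-')[-1].strip() for addr in addr_list]
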
div2_list = soup.find_all(name='div', attrs={"class": "houseInfo"})
info_list = [div2.text for div2 in div2_list]
"""
'1室1廳 | 59平米 | 南 | 精裝 | 中樓層(共14層) | 2010年建 | 板樓'
户型
面積
朝向
裝修
樓層
年代
樓型
"""
hx = [i.split('|')[0].strip() for i in info_list]   # layout (户型)
mj = [i.split('|')[1].strip() for i in info_list]   # area (面積)
cx = [i.split('|')[2].strip() for i in info_list]   # orientation (朝向)
zx = [i.split('|')[3].strip() for i in info_list]   # renovation (裝修)
lc = [i.split('|')[4].strip() for i in info_list]   # floor (樓層)
nd = [i.split('|')[5].strip() for i in info_list]   # year built (年代)
lx = [i.split('|')[-1].strip() for i in info_list]  # building type (樓型)
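
Not every listing necessarily carries all seven fields, so the positional split above can misalign or raise an IndexError. A minimal sketch that guards against short rows (padding with empty strings is an assumption, not documented site behaviour):

# Pad short houseInfo rows so positional indexing stays safe
def split_house_info(text, expected=7):
    parts = [p.strip() for p in text.split('|')]
    return parts + [''] * (expected - len(parts))

parsed = [split_house_info(i) for i in info_list]
hx = [p[0] for p in parsed]  # layout; index the other columns the same way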

div3_list = soup.find_all(name='div', attrs={"class": "followInfo"})
gz = [div3.text for div3 in div3_list]  # follow info (關注)

div4_list = soup.find_all(name='div', attrs={"class": "totalPrice"})
total_price = [div4.text for div4 in div4_list]

div5_list = soup.find_all(name='div', attrs={"class": "unitPrice"})
unit = [div5.text for div5 in div5_list]
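
The price strings still carry units (values such as '610萬' and '61,000元/平米' are assumptions about the current markup). A minimal sketch that strips them down to plain numbers with the re module:

# Extract the numeric part of the price strings with a regex
import re

total_num = [re.search(r'[\d.]+', t).group() for t in total_price]
unit_num = [re.search(r'[\d,]+', u).group().replace(',', '') for u in unit]
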
"""效果"""
import pandas as pd
data_dict = {
    "Name": title_list,
    "Address": addr_list,
    "Layout": hx,
    "Area": mj,
    "Orientation": cx,
    "Renovation": zx,
    "Floor": lc,
    "Year": nd,
    "Building Type": lx,
    "Total Price": total_price,
    "Unit Price": unit
}
df = pd.DataFrame(data_dict)
df.to_excel(r'鏈家.xlsx')

# Multi-page pattern
# You only need to study the URL (there is always a pattern):
#     Page 1: https://sh.lianjia.com/ershoufang/jingan/
#     Page 2: https://sh.lianjia.com/ershoufang/jingan/pg2/
#     Page 3: https://sh.lianjia.com/ershoufang/jingan/pg3/
#     ...
#     Page N: https://sh.lianjia.com/ershoufang/jingan/pgN/
# Page 1 can also be written as https://sh.lianjia.com/ershoufang/jingan/pg1/
base_url = "https://sh.lianjia.com/ershoufang/jingan/pg%s/"
for i in range(1, 100):
    print(base_url % i)
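
Putting the pattern together with the parsing code above, a minimal sketch of a per-page loop (parse_page is a hypothetical helper wrapping the BeautifulSoup logic above, and the 100-page cap mirrors the range shown):

import time
import requests

base_url = "https://sh.lianjia.com/ershoufang/jingan/pg%s/"
for i in range(1, 100):
    res = requests.get(base_url % i)
    parse_page(res.text)  # hypothetical helper built from the code above
    time.sleep(1)         # brief pause between pages to stay polite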

Example 3: Scraping weather data (the site does not load its data in one shot)

"""
有時候網站的數據不是一次性加載的,內部可能是通過js動態請求
http://tianqi.2345.com/wea_history/58362.htm
有些網站內容編碼查看需要在線json格式化
通過network檢查找內部api接口
虹口
http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=11
http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12
http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2021&date%5Bmonth%5D=1

http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=區域&areaInfo%5BareaType%5D=2&date%5Byear%5D=年份&date%5Bmonth%5D=月份
"""

import requests
import pandas as pd

res = requests.get("http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12")
json_dict = res.json()
data = json_dict.get('data')
# pandas.read_html extracts every <table> inside the HTML fragment directly
tables = pd.read_html(data)
tables[0].to_excel(r'weather.xlsx')
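
To cover a longer span, the same endpoint can be called once per month and the tables concatenated. A minimal sketch, assuming area id 71451 (Hongkou) from the URLs above; the month range is illustrative:

# Fetch several months from the same API and merge them into one sheet
import pandas as pd
import requests

frames = []
for year, month in [(2020, 11), (2020, 12), (2021, 1)]:
    res = requests.get(
        "http://tianqi.2345.com/Pc/GetHistory",
        params={
            "areaInfo[areaId]": 71451,   # requests URL-encodes the brackets
            "areaInfo[areaType]": 2,
            "date[year]": year,
            "date[month]": month,
        },
    )
    frames.append(pd.read_html(res.json().get('data'))[0])
pd.concat(frames, ignore_index=True).to_excel(r'weather_all.xlsx')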

Example 4: Scraping Autohome news (filtering out decoy items)

import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.autohome.com.cn/news/")
res.encoding = 'GBK'  # the page is GBK-encoded, so set this before reading res.text
soup = BeautifulSoup(res.text,'lxml')

ul_ele = soup.find(name='ul',class_="article")
li_list = ul_ele.find_all('li')
# print(li_list)
title_list=[]
link_list=[]
info_list=[]
time_list=[]
num_list=[]
for li in li_list:
    if li.find('a'):
        # The list contains decoy items such as <li id="ad_tw_04" style="display: none;"></li>,
        # hence the if guards throughout this loop
        link = li.find('a')['href']
        # print('https:' + link)
        link_list.append('https:' + link)

    # News title lives in the h3 tag
    if li.find('h3'):
        title = li.find('h3').text
        title_list.append(title)

    if li.find('p'):
        info =li.find('p').text
        info_list.append(info)

    # if li.find('span'):
    #     tm = li.find('span').text
    #     time_list.append(tm)
    if li.select('span.fn-left'):
        tm = li.select('span.fn-left')[0].text
        # print(tm)
        time_list.append(tm)

    if li.select('span.fn-right'):
        num = li.select('span.fn-right')[0].find('em').text
        num_list.append(num)
        # The comment count is computed dynamically and defaults to 0;
        # the real value has to be dug out of a separate js request
        # comment = li.select('span.fn-right')[0].find_all('em')
        # print(comment)
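
With the five lists filled, the results can be assembled just like the earlier examples. A minimal sketch, assuming every real news item carries all the fields so the lists stay aligned:

# Collect the parallel lists into one table and export it
import pandas as pd

df = pd.DataFrame({
    'Title': title_list,
    'Link': link_list,
    'Summary': info_list,
    'Time': time_list,
    'Count': num_list,  # first em under fn-right; see the note on comment counts above
})
df.to_excel(r'autohome_news.xlsx', index=False)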

Example 5: Scraping Douban data with openpyxl

# Scrape the Douban Top 250 movies
1. Try scraping a single page first
2. Then work out the multi-page pattern
    https://movie.douban.com/top250
    https://movie.douban.com/top250?start=25&filter=
    https://movie.douban.com/top250?start=50&filter=
    ...
    # Deduced form of page 1
    https://movie.douban.com/top250?start=0&filter=
  
import requests
from openpyxl import Workbook
from bs4 import BeautifulSoup
import time


wb = Workbook()
w1 = wb.create_sheet('Top Movies', index=0)
# Build the header row
w1['A1'] = 'No.'
w1['B1'] = 'Title'
w1['C1'] = 'Link'
w1['D1'] = 'Rating'
w1['E1'] = 'Votes'
# Row counter, defined up front (row 1 is the header)
count = 1

for i in range(0,250,25):
    base_url = 'https://movie.douban.com/top250?start=%s&filter='
    url = base_url%i
    res = requests.get(url,
                       # carry a request header (User-Agent)
                       headers={
                           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
                       }
                       )
    soup = BeautifulSoup(res.text,'lxml')
    ol = soup.find(name='ol',class_='grid_view')
    li_list = ol.find_all(name='li')
    for li in li_list:
        count += 1
        title = li.find(name='span').text
        link = li.find(name='a').get('href')
        num = li.select('.rating_num')[0].text
        comment = li.find(name='div',class_='star').find_all('span')[-1].text
        # Write the row
        w1['A%s'%count] = count - 1
        w1['B%s'%count] = title
        w1['C%s'%count] = link
        w1['D%s'%count] = num
        w1['E%s'%count] = comment
    # Deliberate pause between pages to avoid an IP ban
    time.sleep(5)
wb.save(r'movie.xlsx')
"""上述代碼還可以封裝成函數 和 啓動腳本的形式
def get_data(url):
    ...

if __name__ == '__main__':
    for i in range(0,250,25):
        base_url = 'https://movie.douban.com/top250?start=%s&filter='
        url = base_url%i
        get_data(url)
"""

Summary

1. Start by scraping a single page, or even just a few records
2. Only after the code logic runs end to end should you consider the multi-page case