The requests module lets a Python program issue HTTP requests and fetch the content of a given website. (requests must be installed before use; it comes pre-installed with Anaconda.)
Practice pages
Syntax
import requests
Response object = requests.get(URL)
The attributes of the Response object expose the different parts of the reply.
Set the encoding requests uses when reading the page (UTF-8); the default is ISO-8859-1 (also known as Latin-1), which turns non-ASCII text into mojibake.
# Basic crawler
# Send a request and fetch the raw page
import requests
url = "http://liangyuh.neocities.org/python/demo1.html"
html = requests.get(url)   # send a GET request to the site
html.encoding = "utf-8"    # tell requests to decode the page as UTF-8 (default is ISO-8859-1)
print(html.text)           # Response.text holds the page's HTML source
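Besides .text, the Response object exposes several other attributes. A minimal sketch reusing the html object above (these are all standard requests attributes):
print(html.status_code)   # HTTP status code, e.g. 200 on success
print(html.encoding)      # the encoding used to decode .text
print(len(html.content))  # .content is the raw response body in bytes
print(html.url)           # the final URL after any redirects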
A web page is really just plain text: HTML markup builds the content out of tags.
HTML represents a document as a structured tree, the DOM (Document Object Model). Every tag is wrapped in "<...>", and most come as an opening/closing pair.
For example, the content of demo1.html is:
<!DOCTYPE html>
<html lang="zh">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>我是網頁標題</title>
</head>
<body>
<h1 class="large">我是標題</h1>
<div>
<p>我是段落</p>
<p><img src="HTML5_Logo_256.png" alt="我是圖片"></p>
<p><a href="http://tw.yahoo.com/">我是超連結</a></p>
</div>
</body>
</html>
The BeautifulSoup module parses the fetched HTML source into a tree of structured objects, so a program can quickly pull out the content it needs.
Install it first:
pip install -U beautifulsoup4
# Parse the page
from bs4 import BeautifulSoup
# Build the BeautifulSoup object sp; 'html.parser' is Python's built-in parser
sp = BeautifulSoup(html.text, 'html.parser')
# sp.body.div.a.text
sp.a.text
Commonly used attributes:
In HTML every tag is a node of the DOM tree; BeautifulSoup object.tag_name returns that node's content, HTML tags included.
Adding the .text attribute strips the HTML tags and returns only the text inside the tag.
# Get tag content
print(sp.title)       # returns the whole <title> tag
print(sp.title.text)  # returns just the title text
print(sp.h1)
print(sp.p)
# Use get() to read a tag's attribute values
sp.a.get('href')
sp.a['href']
sp.img.get('src')
sp.img['src']
Tag attributes can be added as search conditions; extra attribute conditions are simply appended as more arguments:
find(tag_name, attribute_name=attribute_value)
find("img", width="20")
When the attribute is class, class is a reserved word in Python, so write class_= instead:
sp.find_all("p", class_='red')
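A short sketch combining several attribute conditions in one call; the tag and attribute values here are hypothetical, not taken from the demo pages:
# Keyword form: every condition must match
sp.find('img', width="20", alt="logo")
# Equivalent attrs-dictionary form, useful for attributes like data-*
sp.find('img', attrs={'width': '20', 'alt': 'logo'})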
# Send a request and fetch the raw page
import requests
from bs4 import BeautifulSoup
url = "http://liangyuh.neocities.org/python/demo2.html"
html = requests.get(url)
html.encoding = "utf-8"
print(html.text)
# Parse the data
sp = BeautifulSoup(html.text, 'html.parser')
# A CSS id is unique within a page, so selecting by id is the most precise
# sp.find('p', id='p2').text
# sp.find('li', class_="even").a.text
# Collect the matches into a list
datas = sp.find_all('p')
for data in datas:
    print(data.text)
datas = sp.find_all('a')
for data in datas:
    print(data.get('href'))
The select() method looks elements up with CSS selectors and returns the matches as a list.
import requests
from bs4 import BeautifulSoup
url = "http://liangyuh.neocities.org/python/demo2.html"
html = requests.get(url)
html.encoding = "utf-8"
#print(html)
print(html.text)
sp = BeautifulSoup(html.text,"html.parser")
print(sp.select('title'))
print(sp.select('p'))
print(sp.select('#p1'))
print(sp.select('.even'))
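select() also accepts compound CSS selectors. A brief sketch against the same demo2.html; the li.even and nested <a> structure is taken from the find() examples elsewhere in these notes:
print(sp.select('li.even'))    # <li> tags with class="even"
print(sp.select('li.even a'))  # <a> tags nested inside those <li>
print(sp.select('p#p1'))       # the <p> whose id is "p1"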
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Food/index.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find_all('div', class_="r-ent")
# datas
for data in datas:
    print(data.find('div', class_='date').text, end=" ")
    print("https://www.ptt.cc" + data.a.get("href"), end=" ")
    print(data.a.text)
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Food/index7001.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find_all('div', class_="r-ent")
for data in datas:
    print(data.find('div', class_='date').text, end=" ")
    print("https://www.ptt.cc" + data.a.get("href"), end=" ")
    print(data.a.text)
"https://www.ptt.cc" + sp.find_all('a', class_="btn wide")[1].get('href')  # URL of the '‹ 上頁' (older page) button
import requests
from bs4 import BeautifulSoup
# Start from the board's newest index page
url = "https://www.ptt.cc/bbs/Food/index.html"
# Number of pages to crawl
for i in range(5):
    print("Page {}".format(i+1))
    html = requests.get(url)
    sp = BeautifulSoup(html.text, 'html.parser')
    datas = sp.find_all('div', class_="r-ent")
    for data in datas:
        print(data.find('div', class_='date').text, end=" ")
        print("https://www.ptt.cc" + data.a.get("href"), end=" ")
        print(data.a.text)
    # Follow the '‹ 上頁' (older page) button, the second 'btn wide' link
    url = "https://www.ptt.cc" + sp.find_all('a', class_="btn wide")[1].get('href')
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Gossiping/index.html"
# Set the over18 cookie to get past PTT's age-check page
headers = {"cookie": "over18=1"}
html = requests.get(url, headers=headers)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find_all('div', class_="r-ent")
for data in datas:
    print(data.find('div', class_='date').text, end=" ")
    print("https://www.ptt.cc" + data.a.get("href"), end=" ")
    print(data.a.text)
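For multi-page crawls of Gossiping, a requests.Session attaches the cookie to every request automatically; a minimal sketch (Session and its cookie jar are standard requests features):
import requests
session = requests.Session()
session.cookies.set("over18", "1")  # stored once, sent with every session.get()
html = session.get("https://www.ptt.cc/bbs/Gossiping/index.html")
print(html.status_code)             # 200 once the age check is passed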
import requests  # check the HTTP status code: 200 OK, 404 not found
url="http://ehappy.tw/bsdemo1.htm"
html = requests.get(url)
html.encoding = "utf-8" # 沒有設網頁編碼為 utf-8, 會出現亂碼
print(html.text)
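The comment above mentions status codes; a minimal sketch that checks the code before using the response (raise_for_status() is the standard requests shortcut for this):
html = requests.get(url)
if html.status_code == 200:   # 200 = OK; 404 = page not found
    print(html.text)
else:
    print("Request failed with status:", html.status_code)
# Alternatively, raise an exception on any 4xx/5xx response:
html.raise_for_status()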
from bs4 import BeautifulSoup  # tip: Tab completion helps here
sp = BeautifulSoup(html.text,"html.parser")
# print(sp)
# print(sp.title)
# print(sp.h1)
# print(sp.p)
print(sp.title)
print(sp.title.text)
# Get values with find()
print(sp.find("h1"))
print(sp.find("h1").text)
print(sp.find("p"))
print(sp.find("p").text)
print(sp.find("a"))  # the <a> hyperlink tag
print(sp.find("a").text)
# Get attribute values, method 1 -- get()
print(sp.find("a"))  # hyperlink
print(sp.find("a").get('href'))
print(sp.find("img"))  # image
print(sp.find("img").get('src'))
# Get attribute values, method 2 -- []
print(sp.find("a"))  # hyperlink
print(sp.find("a")['href'])
print(sp.find("img"))  # image
print(sp.find("img")['src'])
import requests  # check the HTTP status code: 200, 404
from bs4 import BeautifulSoup
url="http://ehappy.tw/bsdemo2.htm"
html = requests.get(url)
html.encoding = "utf-8" # 沒有設網頁編碼為 utf-8, 會出現亂碼
sp = BeautifulSoup(html.text,"html.parser")
sp.title.text
# sp.find('p').text
# sp.find('a').get("href")
sp.find('p', id="p2").text                 # select using two conditions: tag plus attribute value
sp.find('p', style="font-size:16pt").text  # select using tag plus attribute value
sp.find('li', class_="even")               # class is a reserved word, so the underscore (class_) is required
sp.find('li', class_="even").find('a').get('href')
# find_all() returns every match
datas = sp.find_all('a')
type(datas)
for data in datas:
    print(data.text)
    print(data.get('href'))
# PTT Food board crawler -- don't hammer the site; fetch once and keep practicing on the result
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Food/index.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, "html.parser")
sp.title.text
# Inspect the page structure with Chrome DevTools (Elements tab)
# find_all() returns every match
datas = sp.find_all('div', class_="r-ent")
for data in datas:  # end="" can keep each record on one line
    print(data.find('div', class_='date').text)
    # print(data.find('div', class_='title').find('a').get('href'))
    print('https://www.ptt.cc' + data.find('div', class_='title').find('a').get('href'))  # prepend the site root
    print(data.find('div', class_='title').find('a').text)
# Taiwan Lottery tolerates scraping well; avoid Taiwan High Speed Rail, which blocks crawlers
# Scrape only the Super Lotto (威力彩); right-click > Inspect to locate the elements
import requests
from bs4 import BeautifulSoup
url = "https://www.taiwanlottery.com.tw/index_new.aspx"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')  # parse the page
sp.title
datas = sp.find('div', class_="contents_box02")  # find() returns only the first match
print(datas.find('div',class_="contents_mine_tx02").find('span',class_="font_black15").text)
nums = datas.find_all('div',class_='ball_tx ball_green')
# nums
print('Draw order: ')
for i in range(0, 6):
    print(nums[i].text, end=' ')
print('\nSorted order: ')
for i in range(6, 12):
    print(nums[i].text, end=' ')
print('\nSpecial number: ', datas.find('div', class_="ball_red").text)
# Grab the Lotto 6/49 (大樂透)
# datas = sp.find_all('div', class_="contents_box02")  # find_all() gives a list: blocks 0,1,2,3
# datas[2]  # the third block
# Or index directly:
datas = sp.find_all('div', class_="contents_box02")[2]  # pick the third block in one step
datas
print(datas.find('div',class_="contents_mine_tx02").find('span',class_="font_black15").text)
nums = datas.find_all('div',class_='ball_tx ball_yellow')
# nums
print('Draw order: ')
for i in range(0, 6):
    print(nums[i].text, end=' ')
print('\nSorted order: ')
for i in range(6, 12):
    print(nums[i].text, end=' ')
print('\nSpecial number: ', datas.find('div', class_="ball_red").text)
# Batch-download LINE stickers
import requests
from bs4 import BeautifulSoup
url = 'https://store.line.me/stickershop/product/15830473/zh-Hant'
html = requests.get(url)
sp = BeautifulSoup(html.text,'html.parser')
sp.title
# len(sp.find_all('li', class_='mdCMN09Li FnStickerPreviewItem'))  # check the length first to confirm anything was scraped
datas = sp.find_all('li',class_='mdCMN09Li FnStickerPreviewItem')
# datas[0]
datas[0].get('data-preview')
# Dictionaries in action: each sticker's data-preview attribute holds JSON, i.e. {key: value, ...}
import json
imginfo = json.loads(datas[2].get('data-preview'))  # pick which sticker to decode
# type(imginfo)
print(imginfo['staticUrl'])
# Pull out every sticker's image URL
import json
for data in datas:
    imginfo = json.loads(data.get('data-preview'))
    print(imginfo['staticUrl'])
# Download a remote image file with open()
import requests
url = 'https://stickershop.line-scdn.net/stickershop/v1/sticker/412149504/android/sticker.png;compress=true'
img = requests.get(url)
with open('test123.png', 'wb') as f:
    f.write(img.content)
import requests
import json
for data in datas:
    imginfo = json.loads(data.get('data-preview'))
    imgurl = imginfo['staticUrl']
    imgid = imginfo['id']
    imgfile = requests.get(imgurl)
    with open(imgid + '.png', 'wb') as f:
        f.write(imgfile.content)
import requests
from bs4 import BeautifulSoup
url = "https://www.irasutoya.com/2021/01/blog-post_64.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find('div', class_="separator").find_all('a')
for data in datas:
    print(data.get('href'))
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find('div', class_="separator").find_all('a')
for data in datas:
    print(data.get('href').split('/')[-1])  # the last path segment is the file name
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find('div', class_="separator").find_all('a')
for data in datas:
    imgfile = requests.get(data.get('href'))
    imgname = data.get('href').split('/')[-1]
    with open('/content/drive/MyDrive/Colab Notebooks/mouseimg/' + imgname, 'wb') as f:
        f.write(imgfile.content)
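open() fails if the target folder does not exist; a minimal sketch creating it first (os.makedirs with exist_ok=True is the standard approach; the path matches the Colab example above):
import os
folder = '/content/drive/MyDrive/Colab Notebooks/mouseimg/'
os.makedirs(folder, exist_ok=True)  # create the folder only if it is missing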
# Moedict (萌典) dictionary API
import requests, json
url = 'https://www.moedict.tw/uni/%E8%90%8C'
raw = requests.get(url)
datas = json.loads(raw.text)
type(datas)
print(datas['title'])
print(datas['radical'])
print(datas['stroke_count'])
print(datas['heteronyms'][0]['bopomofo'])
# Mini Mandarin dictionary lookup
import requests, json
word = input('Enter the character to look up: ')
url = 'https://www.moedict.tw/uni/' + word  # append the character to the URL
raw = requests.get(url)
datas = json.loads(raw.text)
print('Character: {}, bopomofo: {}, radical: {}, strokes: {}'
      .format(datas['title'],
              datas['heteronyms'][0]['bopomofo'],
              datas['radical'],
              datas['stroke_count']))  # one statement split across several lines
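If the character is not in the dictionary, the response will not be usable JSON; a small defensive sketch that checks the status code before decoding (the API's exact failure mode is an assumption here):
raw = requests.get('https://www.moedict.tw/uni/' + word)
if raw.status_code == 200:
    datas = json.loads(raw.text)
    print(datas['title'])
else:
    print('Lookup failed, HTTP status:', raw.status_code)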
import requests
url = 'https://covid19dashboard.cdc.gov.tw/dash3'
html = requests.get(url)
html.text
import json
datas = json.loads(html.text)
for k, v in datas["0"].items():
    print(k, v)