The requests module lets a Python program issue HTTP requests and fetch the content of a given website. (requests must be installed before use; it comes pre-installed with Anaconda.)
Practice pages
Syntax
import requests
Response object = requests.get(URL)
The attributes of the Response object expose the different parts of the reply.
Set the encoding requests uses when reading the page (UTF-8); the default is ISO-8859-1 (also known as Latin-1), which turns non-ASCII text into mojibake.
# Basic crawler
# Send a request and fetch the raw page
import requests
url = "http://liangyuh.neocities.org/python/demo1.html"
html = requests.get(url)   # send a GET request to the site
html.encoding = "utf-8"    # tell requests to decode the page as UTF-8 (default is ISO-8859-1)
print(html.text)           # Response.text holds the page's HTML source
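Besides .text, the Response object exposes several other attributes. A minimal sketch reusing the html object above (these are all standard requests attributes):
print(html.status_code)   # HTTP status code, e.g. 200 on success
print(html.encoding)      # the encoding used to decode .text
print(len(html.content))  # .content is the raw response body in bytes
print(html.url)           # the final URL after any redirects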
A web page is really just plain text: HTML markup builds the content out of tags.
HTML represents a document as a structured tree, the DOM (Document Object Model). Every tag is wrapped in "<...>", and most come as an opening/closing pair.
For example, the content of demo1.html is:
<!DOCTYPE html>
<html lang="zh">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>我是網頁標題</title>
</head>
<body>
<h1 class="large">我是標題</h1>
<div>
<p>我是段落</p>
<p><img src="HTML5_Logo_256.png" alt="我是圖片"></p>
<p><a href="http://tw.yahoo.com/">我是超連結</a></p>
</div>
</body>
</html>
The BeautifulSoup module parses the fetched HTML source into a tree of structured objects, so a program can quickly pull out the content it needs.
Install it first:
pip install -U beautifulsoup4
# Parse the page
from bs4 import BeautifulSoup
# Build the BeautifulSoup object sp; 'html.parser' is Python's built-in parser
sp = BeautifulSoup(html.text, 'html.parser')
# sp.body.div.a.text
sp.a.text
Commonly used attributes:
In HTML every tag is a node of the DOM tree; BeautifulSoup object.tag_name returns that node's content, HTML tags included.
Adding the .text attribute strips the HTML tags and returns only the text inside the tag.
# Get tag content
print(sp.title)       # returns the whole <title> tag
print(sp.title.text)  # returns just the title text
print(sp.h1)
print(sp.p)
# Use get() to read a tag's attribute values
sp.a.get('href')
sp.a['href']
sp.img.get('src')
sp.img['src']
Tag attributes can be added as search conditions; extra attribute conditions are simply appended as more arguments:
find(tag_name, attribute_name=attribute_value)
find("img", width="20")
When the attribute is class, class is a reserved word in Python, so write class_= instead:
sp.find_all("p", class_='red')
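A short sketch combining several attribute conditions in one call; the tag and attribute values here are hypothetical, not taken from the demo pages:
# Keyword form: every condition must match
sp.find('img', width="20", alt="logo")
# Equivalent attrs-dictionary form, useful for attributes like data-*
sp.find('img', attrs={'width': '20', 'alt': 'logo'})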
# Send a request and fetch the raw page
import requests
from bs4 import BeautifulSoup
url = "http://liangyuh.neocities.org/python/demo2.html"
html = requests.get(url)
html.encoding = "utf-8"
print(html.text)
# Parse the data
sp = BeautifulSoup(html.text, 'html.parser')
# A CSS id is unique within a page, so selecting by id is the most precise
# sp.find('p', id='p2').text
# sp.find('li', class_="even").a.text
# Collect the matches into a list
datas = sp.find_all('p')
for data in datas:
    print(data.text)
datas = sp.find_all('a')
for data in datas:
    print(data.get('href'))
The select() method looks elements up with CSS selectors and returns the matches as a list.
import requests
from bs4 import BeautifulSoup
url = "http://liangyuh.neocities.org/python/demo2.html"
html = requests.get(url)
html.encoding = "utf-8"
#print(html)
print(html.text)
sp = BeautifulSoup(html.text,"html.parser")
print(sp.select('title'))
print(sp.select('p'))
print(sp.select('#p1'))
print(sp.select('.even'))
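select() also accepts compound CSS selectors. A brief sketch against the same demo2.html; the li.even and nested <a> structure is taken from the find() examples elsewhere in these notes:
print(sp.select('li.even'))    # <li> tags with class="even"
print(sp.select('li.even a'))  # <a> tags nested inside those <li>
print(sp.select('p#p1'))       # the <p> whose id is "p1"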
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Food/index.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find_all('div', class_="r-ent")
# datas
for data in datas:
    print(data.find('div', class_='date').text, end=" ")
    print("https://www.ptt.cc" + data.a.get("href"), end=" ")
    print(data.a.text)
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Food/index7001.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find_all('div', class_="r-ent")
for data in datas:
    print(data.find('div', class_='date').text, end=" ")
    print("https://www.ptt.cc" + data.a.get("href"), end=" ")
    print(data.a.text)
"https://www.ptt.cc" + sp.find_all('a', class_="btn wide")[1].get('href')  # URL of the '‹ 上頁' (older page) button
import requests
from bs4 import BeautifulSoup
# Start from the board's newest index page
url = "https://www.ptt.cc/bbs/Food/index.html"
# Number of pages to crawl
for i in range(5):
    print("Page {}".format(i+1))
    html = requests.get(url)
    sp = BeautifulSoup(html.text, 'html.parser')
    datas = sp.find_all('div', class_="r-ent")
    for data in datas:
        print(data.find('div', class_='date').text, end=" ")
        print("https://www.ptt.cc" + data.a.get("href"), end=" ")
        print(data.a.text)
    # Follow the '‹ 上頁' (older page) button, the second 'btn wide' link
    url = "https://www.ptt.cc" + sp.find_all('a', class_="btn wide")[1].get('href')
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Gossiping/index.html"
# Set the over18 cookie to get past PTT's age-check page
headers = {"cookie": "over18=1"}
html = requests.get(url, headers=headers)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find_all('div', class_="r-ent")
for data in datas:
    print(data.find('div', class_='date').text, end=" ")
    print("https://www.ptt.cc" + data.a.get("href"), end=" ")
    print(data.a.text)
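For multi-page crawls of Gossiping, a requests.Session attaches the cookie to every request automatically; a minimal sketch (Session and its cookie jar are standard requests features):
import requests
session = requests.Session()
session.cookies.set("over18", "1")  # stored once, sent with every session.get()
html = session.get("https://www.ptt.cc/bbs/Gossiping/index.html")
print(html.status_code)             # 200 once the age check is passed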
import requests  # check the HTTP status code: 200 OK, 404 not found
url="http://ehappy.tw/bsdemo1.htm"
html = requests.get(url)
html.encoding = "utf-8" # 沒有設網頁編碼為 utf-8, 會出現亂碼
print(html.text)
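The comment above mentions status codes; a minimal sketch that checks the code before using the response (raise_for_status() is the standard requests shortcut for this):
html = requests.get(url)
if html.status_code == 200:   # 200 = OK; 404 = page not found
    print(html.text)
else:
    print("Request failed with status:", html.status_code)
# Alternatively, raise an exception on any 4xx/5xx response:
html.raise_for_status()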
from bs4 import BeautifulSoup  # tip: Tab completion helps here
sp = BeautifulSoup(html.text,"html.parser")
# print(sp)
# print(sp.title)
# print(sp.h1)
# print(sp.p)
print(sp.title)
print(sp.title.text)
# Get values with find()
print(sp.find("h1"))
print(sp.find("h1").text)
print(sp.find("p"))
print(sp.find("p").text)
print(sp.find("a"))  # the <a> hyperlink tag
print(sp.find("a").text)
# Get attribute values, method 1 -- get()
print(sp.find("a"))  # hyperlink
print(sp.find("a").get('href'))
print(sp.find("img"))  # image
print(sp.find("img").get('src'))
# Get attribute values, method 2 -- []
print(sp.find("a"))  # hyperlink
print(sp.find("a")['href'])
print(sp.find("img"))  # image
print(sp.find("img")['src'])
import requests  # check the HTTP status code: 200, 404
from bs4 import BeautifulSoup
url="http://ehappy.tw/bsdemo2.htm"
html = requests.get(url)
html.encoding = "utf-8" # 沒有設網頁編碼為 utf-8, 會出現亂碼
sp = BeautifulSoup(html.text,"html.parser")
sp.title.text
# sp.find('p').text
# sp.find('a').get("href")
sp.find('p', id="p2").text                 # select using two conditions: tag plus attribute value
sp.find('p', style="font-size:16pt").text  # select using tag plus attribute value
sp.find('li', class_="even")               # class is a reserved word, so the underscore (class_) is required
sp.find('li', class_="even").find('a').get('href')
# find_all() returns every match
datas = sp.find_all('a')
type(datas)
for data in datas:
    print(data.text)
    print(data.get('href'))
# PTT Food board crawler -- don't hammer the site; fetch once and keep practicing on the result
import requests
from bs4 import BeautifulSoup
url = "https://www.ptt.cc/bbs/Food/index.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, "html.parser")
sp.title.text
# Inspect the page structure with Chrome DevTools (Elements tab)
# find_all() returns every match
datas = sp.find_all('div', class_="r-ent")
for data in datas:  # end="" can keep each record on one line
    print(data.find('div', class_='date').text)
    # print(data.find('div', class_='title').find('a').get('href'))
    print('https://www.ptt.cc' + data.find('div', class_='title').find('a').get('href'))  # prepend the site root
    print(data.find('div', class_='title').find('a').text)
# Taiwan Lottery tolerates scraping well; avoid Taiwan High Speed Rail, which blocks crawlers
# Scrape only the Super Lotto (威力彩); right-click > Inspect to locate the elements
import requests
from bs4 import BeautifulSoup
url = "https://www.taiwanlottery.com.tw/index_new.aspx"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')  # parse the page
sp.title
datas = sp.find('div', class_="contents_box02")  # find() returns only the first match
print(datas.find('div',class_="contents_mine_tx02").find('span',class_="font_black15").text)
nums = datas.find_all('div',class_='ball_tx ball_green')
# nums
print('Draw order: ')
for i in range(0, 6):
    print(nums[i].text, end=' ')
print('\nSorted order: ')
for i in range(6, 12):
    print(nums[i].text, end=' ')
print('\nSpecial number: ', datas.find('div', class_="ball_red").text)
# Grab the Lotto 6/49 (大樂透)
# datas = sp.find_all('div', class_="contents_box02")  # find_all() gives a list: blocks 0,1,2,3
# datas[2]  # the third block
# Or index directly:
datas = sp.find_all('div', class_="contents_box02")[2]  # pick the third block in one step
datas
print(datas.find('div',class_="contents_mine_tx02").find('span',class_="font_black15").text)
nums = datas.find_all('div',class_='ball_tx ball_yellow')
# nums
print('Draw order: ')
for i in range(0, 6):
    print(nums[i].text, end=' ')
print('\nSorted order: ')
for i in range(6, 12):
    print(nums[i].text, end=' ')
print('\nSpecial number: ', datas.find('div', class_="ball_red").text)
# Batch-download LINE stickers
import requests
from bs4 import BeautifulSoup
url = 'https://store.line.me/stickershop/product/15830473/zh-Hant'
html = requests.get(url)
sp = BeautifulSoup(html.text,'html.parser')
sp.title
# len(sp.find_all('li', class_='mdCMN09Li FnStickerPreviewItem'))  # check the length first to confirm anything was scraped
datas = sp.find_all('li',class_='mdCMN09Li FnStickerPreviewItem')
# datas[0]
datas[0].get('data-preview')
# Dictionaries in action: each sticker's data-preview attribute holds JSON, i.e. {key: value, ...}
import json
imginfo = json.loads(datas[2].get('data-preview'))  # pick which sticker to decode
# type(imginfo)
print(imginfo['staticUrl'])
# Pull out every sticker's image URL
import json
for data in datas:
    imginfo = json.loads(data.get('data-preview'))
    print(imginfo['staticUrl'])
# Download a remote image file with open()
import requests
url = 'https://stickershop.line-scdn.net/stickershop/v1/sticker/412149504/android/sticker.png;compress=true'
img = requests.get(url)
with open('test123.png', 'wb') as f:
    f.write(img.content)
import requests
import json
for data in datas:
    imginfo = json.loads(data.get('data-preview'))
    imgurl = imginfo['staticUrl']
    imgid = imginfo['id']
    imgfile = requests.get(imgurl)
    with open(imgid + '.png', 'wb') as f:
        f.write(imgfile.content)
import requests
from bs4 import BeautifulSoup
url = "https://www.irasutoya.com/2021/01/blog-post_64.html"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find('div', class_="separator").find_all('a')
for data in datas:
    print(data.get('href'))
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find('div', class_="separator").find_all('a')
for data in datas:
    print(data.get('href').split('/')[-1])  # the last path segment is the file name
sp = BeautifulSoup(html.text, 'html.parser')
datas = sp.find('div', class_="separator").find_all('a')
for data in datas:
    imgfile = requests.get(data.get('href'))
    imgname = data.get('href').split('/')[-1]
    with open('/content/drive/MyDrive/Colab Notebooks/mouseimg/' + imgname, 'wb') as f:
        f.write(imgfile.content)
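open() fails if the target folder does not exist; a minimal sketch creating it first (os.makedirs with exist_ok=True is the standard approach; the path matches the Colab example above):
import os
folder = '/content/drive/MyDrive/Colab Notebooks/mouseimg/'
os.makedirs(folder, exist_ok=True)  # create the folder only if it is missing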
# Moedict (萌典) dictionary API
import requests, json
url = 'https://www.moedict.tw/uni/%E8%90%8C'
raw = requests.get(url)
datas = json.loads(raw.text)
type(datas)
print(datas['title'])
print(datas['radical'])
print(datas['stroke_count'])
print(datas['heteronyms'][0]['bopomofo'])
# Mini Mandarin dictionary lookup
import requests, json
word = input('Enter the character to look up: ')
url = 'https://www.moedict.tw/uni/' + word  # append the character to the URL
raw = requests.get(url)
datas = json.loads(raw.text)
print('Character: {}, bopomofo: {}, radical: {}, strokes: {}'
      .format(datas['title'],
              datas['heteronyms'][0]['bopomofo'],
              datas['radical'],
              datas['stroke_count']))  # one statement split across several lines
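If the character is not in the dictionary, the response will not be usable JSON; a small defensive sketch that checks the status code before decoding (the API's exact failure mode is an assumption here):
raw = requests.get('https://www.moedict.tw/uni/' + word)
if raw.status_code == 200:
    datas = json.loads(raw.text)
    print(datas['title'])
else:
    print('Lookup failed, HTTP status:', raw.status_code)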
import requests
url = 'https://covid19dashboard.cdc.gov.tw/dash3'
html = requests.get(url)
html.text
import json
datas = json.loads(html.text)
for k, v in datas["0"].items():
    print(k, v)