Py / Python網頁資料擷取與分析班-筆記 9/20

主要內容

_抓取YAHOO股市所有類別

_下載所有大樂透資料


_抓取YAHOO股市所有類別

原本以為分頁是按照順序的,結果卻不是

所以先抓取所有類別的網址

之後再分別抓取裡面的資料

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
from bs4 import BeautifulSoup
import time

# --- Scrape every Yahoo! Finance (TW) stock category ---
# The category pages are not numbered sequentially, so first collect every
# category href from the sector-list page, then visit each one in turn.

# Reuse one session for all requests (connection keep-alive); the original
# created a new session inside the loop on every iteration.
session = requests.session()

list_page = session.get("https://tw.finance.yahoo.com/class-quote?sectorId=1&exchange=TAI")
links = BeautifulSoup(list_page.text, "html.parser").find_all("a", "Pstart(16px)")
category_paths = [a.get("href") for a in links]
print(category_paths)

# Accumulate CSV lines in a list and join once at the end — avoids the
# quadratic `s += ...` string concatenation of the original.
lines = ["股票名稱,代號,股價,漲跌,漲跌幅(%),開盤,昨收,最高,最低,成交量(張),時間"]
for path in category_paths:  # note: the paths are NOT simply pages 1-52
    url = "https://tw.finance.yahoo.com" + path  # one Yahoo stock category page
    page = session.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    # One matching <div> per listed stock on the category page.
    for entry in soup.find_all("div", "Bdbc($bd-primary-divider)"):
        name = entry.find("div", "Ell")           # stock name
        symbol = entry.find("div", "D(f) Ai(c)")  # ticker symbol
        # Remaining cells: price, change, change %, open, prev close,
        # high, low, volume, time. Strip thousands separators so the
        # commas don't break the CSV columns.
        fields = [cell.text.replace(",", "")
                  for cell in entry.find_all("div", "Mend(0):lc")]
        lines.append(",".join([name.text, symbol.text] + fields))
    lines.append("------------")  # page separator, kept from the original output
    time.sleep(1)  # be polite: throttle to one category page per second

# Explicit UTF-8 so the Chinese header/content round-trips on any platform
# (the original relied on the platform default encoding), and `with` so the
# file is closed even if the write raises.
with open("0920-13.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")
print("--完成--")

 

_下載所有大樂透資料

資料來源:樂透雲

範例是存成csv檔

練習改存成excel檔

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import requests
from bs4 import BeautifulSoup
import pandas as pd  # hoisted to the top with the other imports

# --- Download Taiwan Lotto ("大樂透") draw history from lotto-8.com ---
# Each row: draw date, the six winning numbers, the special number, remarks.
columns = ["日期", 1, 2, 3, 4, 5, 6, "特別號", "備註"]
rows = []

for page in range(1, 11):  # scrape the first 10 result pages
    url = "https://www.lotto-8.com/listltobigbbk.asp?indexpage=" + str(page) + "&orderby=new"
    resp = requests.get(url)
    resp.encoding = "utf-8"  # force UTF-8; the site's charset header is unreliable
    table = BeautifulSoup(resp.text, "html.parser").find("table", "auto-style4")

    # The first <tr> is the column header — skip it.
    for tr in table.find_all("tr")[1:]:
        cells = tr.find_all("td")
        if len(cells) < 4:
            # Guard against padding/malformed rows; the original would
            # raise IndexError here.
            continue
        draw_date = cells[0].text
        # cells[1] holds "n1,n2,...,n6" padded with non-breaking spaces.
        numbers = [int(n) for n in cells[1].text.replace("\xa0", "").split(",")]
        special = int(cells[2].text)  # special ("bonus") number
        remark = cells[3].text
        rows.append([draw_date, *numbers, special, remark])

print(rows)

# 1-based row index, replacing the original hand-maintained j/index counter.
df = pd.DataFrame(rows, index=range(1, len(rows) + 1), columns=columns)

df.to_excel("0920-23.xlsx")