主要內容
_抓取YAHOO股市所有類別
_下載所有大樂透資料
_抓取YAHOO股市所有類別
原本以為各類別頁面的 sectorId 是按順序連續編號的(1–52),結果卻不是
所以先抓取所有類別的網址
之後再分別抓取裡面的資料
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# Scrape every stock category listed on Yahoo Taiwan Finance into one CSV file.
#
# The category ids are not a contiguous 1-52 range, so the category URLs are
# first collected from the links on an index page and each one is then
# fetched in turn.
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://tw.finance.yahoo.com"
CATEGORY_INDEX_URL = BASE_URL + "/class-quote?sectorId=1&exchange=TAI"
CSV_HEADER = "股票名稱,代號,股價,漲跌,漲跌幅(%),開盤,昨收,最高,最低,成交量(張),時間"
CATEGORY_SEPARATOR = "------------"
OUTPUT_PATH = "0920-13.csv"
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection hangs forever


def _category_paths(session):
    """Return the href of every category link found on the index page."""
    response = session.get(CATEGORY_INDEX_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    hrefs = (link.get("href") for link in soup.find_all("a", "Pstart(16px)"))
    # Anchors without an href cannot be turned into a URL, so drop them.
    return [href for href in hrefs if href]


def _quote_line(row):
    """Build one CSV line from a quote row, or None if it is not a quote row."""
    name = row.find("div", "Ell")
    code = row.find("div", "D(f) Ai(c)")
    if name is None or code is None:
        # The row selector can also match divs that carry no stock name or
        # code; skip them rather than crash on ``None.text``.
        return None
    fields = [name.text, code.text]
    # Drop thousands separators so the values do not break the CSV columns.
    fields.extend(
        cell.text.replace(",", "")
        for cell in row.find_all("div", "Mend(0):lc")
    )
    return ",".join(fields)


def _category_lines(session, url):
    """Download one category page and return its quote rows as CSV lines."""
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        # One failing category should not discard everything scraped so far.
        print("skipped {}: {}".format(url, err))
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    rows = soup.find_all("div", "Bdbc($bd-primary-divider)")
    lines = (_quote_line(row) for row in rows)
    return [line for line in lines if line is not None]


def scrape_yahoo_quotes(output_path=OUTPUT_PATH, delay=1):
    """Scrape all categories and write them to ``output_path`` as CSV.

    ``delay`` is the pause in seconds after each category request.
    """
    lines = [CSV_HEADER]
    # One shared session reuses the connection and is closed on exit.
    with requests.Session() as session:
        paths = _category_paths(session)
        print(paths)
        for path in paths:
            lines.extend(_category_lines(session, BASE_URL + path))
            lines.append(CATEGORY_SEPARATOR)
            time.sleep(delay)

    # Explicit encoding: the header is CJK text, and the platform default
    # codec may be unable to encode it.
    with open(output_path, "w", encoding="utf-8") as out:
        out.write("\n".join(lines) + "\n")
    print("--完成--")


if __name__ == "__main__":
    scrape_yahoo_quotes()
_下載所有大樂透資料
資料來源:樂透雲
範例是存成csv檔
練習改存成excel檔
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# Download Big Lotto draw history from lotto-8.com and save it as an Excel
# workbook, one row per draw.
import pandas as pd
import requests
from bs4 import BeautifulSoup

LOTTO_URL = "https://www.lotto-8.com/listltobigbbk.asp?indexpage={}&orderby=new"
COLUMNS = ["日期", 1, 2, 3, 4, 5, 6, "特別號", "備註"]
OUTPUT_PATH = "0920-23.xlsx"
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection hangs forever


def _parse_draw(cells):
    """Convert the cell texts of one table row into a draw record.

    ``cells`` holds at least four strings: date, comma-separated numbers,
    special number and remark.  Raises ``ValueError`` when the numbers are
    not integers or fewer than six are present.
    """
    date, numbers_text, special, remark = cells[:4]
    # The numbers cell is padded with non-breaking spaces in the page markup.
    numbers = [int(n) for n in numbers_text.replace("\xa0", "").split(",")]
    n1, n2, n3, n4, n5, n6 = numbers[:6]
    return [date, n1, n2, n3, n4, n5, n6, int(special), remark]


def _fetch_draws(page):
    """Download one result page and return its draws as a list of records."""
    url = LOTTO_URL.format(page)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", "auto-style4")
    if table is None:
        # Fail with the URL instead of an opaque AttributeError on None.
        raise RuntimeError("draw table not found on " + url)

    draws = []
    # The first <tr> is skipped, as in the original loop that started at 1.
    for tr in table.find_all("tr")[1:]:
        cells = [td.text for td in tr.find_all("td")]
        if len(cells) < 4:
            # A draw needs all four cells; shorter rows cannot be parsed.
            continue
        draws.append(_parse_draw(cells))
    return draws


def download_lotto_history(pages=10, output_path=OUTPUT_PATH):
    """Scrape the newest ``pages`` result pages and save them to Excel."""
    data = []
    for page in range(1, pages + 1):
        data.extend(_fetch_draws(page))
    print(data)
    # Number the rows from 1 in the order they were scraped.
    df = pd.DataFrame(data, index=range(1, len(data) + 1), columns=COLUMNS)
    df.to_excel(output_path)


if __name__ == "__main__":
    download_lotto_history()