- Crawling: collecting information from the internet so that the data can be analyzed and put to use
- Scraping: crawling plus extracting and processing the collected data into the final result you actually want
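To make the distinction concrete, a minimal sketch (the URL is a placeholder, not from these notes): the request is the crawling step, the parsing and extraction is the scraping step.

import requests
from bs4 import BeautifulSoup as bs

# Crawling: fetch the raw HTML (placeholder page)
html = requests.get('https://example.com').text
# Scraping: parse it and pull out the piece we want
soup = bs(html, 'html.parser')
print(soup.find('h1').text)  # example.com serves a single <h1> heading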
1. Basic English Speaking
import requests
from bs4 import BeautifulSoup as bs

# Assumed URL (the original notes omit it): the conversation-topic list on basicenglishspeaking.com
site = 'https://basicenglishspeaking.com/daily-english-conversation-topics/'
request = requests.get(site)
print(request)  # <Response [200]> on success
# print(request.text)

soup = bs(request.text, 'html.parser')  # BeautifulSoup was imported as bs
divs = soup.find('div', {'class': 'thrv-columns'})  # the column block that holds the topic links
links = divs.findAll('a')
print(links)

for link in links:
    print(link.text)

subject = []
for link in links:
    subject.append(link.text)
print(subject)

print('Found a total of', len(subject), 'topics')
for i in range(len(subject)):
    print('{0:2d}. {1:s}'.format(i + 1, subject[i]))
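The same list can be collected with one CSS selector instead of nested find calls; a sketch, assuming the same page structure as above:

# soup.select takes a CSS selector: every <a> inside the thrv-columns block
subject = [a.text for a in soup.select('div.thrv-columns a')]
print('Found a total of', len(subject), 'topics')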
2. Daum News Articles
def daum_news_title(news_id):
    # Assumed Daum article URL pattern, built from the numeric article id
    url = f'https://v.daum.net/v/{news_id}'
    request = requests.get(url)
    soup = bs(request.text, 'html.parser')
    title = soup.find('h3', {'class': 'tit_view'})
    if title:
        return title.text.strip()
    return 'No title found'

print(daum_news_title(20230601093610464))
print(daum_news_title(20230601072518640))
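A slightly hardened variant (a sketch only, same assumed URL pattern): it adds a timeout and raises on error responses instead of silently parsing an error page.

def daum_news_title_safe(news_id):
    url = f'https://v.daum.net/v/{news_id}'
    response = requests.get(url, timeout=5)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    soup = bs(response.text, 'html.parser')
    title = soup.find('h3', {'class': 'tit_view'})
    return title.text.strip() if title else 'No title found'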
3. Bugs Music Chart
# Assumed URL (the original notes omit it): the Bugs realtime chart
site = 'https://music.bugs.co.kr/chart'
request = requests.get(site)
soup = bs(request.text, 'html.parser')

titles = soup.findAll('p', {'class': 'title'})
artists = soup.findAll('p', {'class': 'artist'})
for i, (t, a) in enumerate(zip(titles, artists)):  # zip stops at the shorter list
    title = t.text.strip()
    artist = a.text.strip().split('\n')[0]  # multi-artist entries span several lines; keep the first
    print('{0:3d}. {1} - {2}'.format(i + 1, artist, title))
4. Melon Chart
# Assumed URL (the original notes omit it): the Melon top-100 chart
site = 'https://www.melon.com/chart/index.htm'
request = requests.get(site)
print(request)  # <Response [406]> - Melon rejects clients without a browser User-Agent
# A real browser sends something like:
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
request = requests.get(site, headers=header)
print(request)  # <Response [200]> once the header is sent

soup = bs(request.text, 'html.parser')
titles = soup.findAll('div', {'class': 'rank01'})
artists = soup.findAll('span', {'class': 'checkEllipsis'})
for i, (t, a) in enumerate(zip(titles, artists)):
    title = t.text.strip()
    artist = a.text.strip().split('\n')[0]
    print('{0:3d}. {1} - {2}'.format(i + 1, title, artist))
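The same rows can be collected into a DataFrame instead of only printed, the way the Genie assignment below does; a sketch reusing the titles and artists parsed above:

import pandas as pd

rows = [{'rank': i + 1, 'title': t.text.strip(), 'artist': a.text.strip().split('\n')[0]}
        for i, (t, a) in enumerate(zip(titles, artists))]
melon_df = pd.DataFrame(rows)
print(melon_df.head())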
5. Naver Finance
# Assumed URL (the original notes omit it): a Naver Finance stock page; 005930 is Samsung Electronics
site = 'https://finance.naver.com/item/main.naver?code=005930'
request = requests.get(site)
soup = bs(request.text, 'html.parser')
div_today = soup.find('div', {'class': 'today'})
print(div_today)
em = div_today.find('em')
print(em)
price = em.find('span', {'class': 'blind'})
print(price.text)
def naver_finance(code):
    site = f'https://finance.naver.com/item/main.naver?code={code}'  # same assumed URL pattern
    request = requests.get(site)
    soup = bs(request.text, 'html.parser')
    div_today = soup.find('div', {'class': 'today'})
    em = div_today.find('em')
    price = em.find('span', {'class': 'blind'}).text  # current price
    wrap_company = soup.find('div', {'class': 'wrap_company'})
    name = wrap_company.a.text  # company name
    div_description = wrap_company.find('div', {'class': 'description'})
    code = div_description.span.text  # ticker code
    table_no_info = soup.find('table', {'class': 'no_info'})
    tds = table_no_info.find_all('td')
    volume = tds[2].find('span', {'class': 'blind'}).text  # trading volume
    dic = {'price': price, 'name': name, 'code': code, 'volume': volume}
    return dic
# Example ticker codes (assumed for illustration): Samsung Electronics, SK hynix, NAVER
codes = ['005930', '000660', '035420']
data = []
for code in codes:
    dic = naver_finance(code)
    data.append(dic)
print(data)

import pandas as pd
df = pd.DataFrame(data)
print(df)
df.to_excel('naver_finance.xlsx')
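As a quick check, the exported file can be read back (to_excel/read_excel rely on the openpyxl package being installed):

print(pd.read_excel('naver_finance.xlsx', index_col=0))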
# Assignment: crawl the Genie chart down to rank 200,
# using a loop over the chart pages,
# collect the results into a DataFrame,
# export them to Excel,
# and submit the work to GitHub.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

data = []
num = 1
for j in range(1, 5):  # the top 200 is split across four pages of 50 entries
    # Assumed URL pattern (the original notes omit it): pg selects the chart page
    site = f'https://www.genie.co.kr/chart/top200?pg={j}'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    request = requests.get(site, headers=header)
    soup = bs(request.text, 'html.parser')
    tds = soup.findAll('td', {'class': 'info'})
    for i in range(len(tds)):
        title = tds[i].find('a', {'class': 'title ellipsis'}).text.strip()
        artist = tds[i].find('a', {'class': 'artist ellipsis'}).text.strip()
        print('{0:3d}. {1} - {2}'.format(num, title, artist))
        dic = {'rank': num, 'title': title, 'artist': artist}
        data.append(dic)
        num += 1

df = pd.DataFrame(data)
df.to_excel('genie_music.xlsx')
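A quick sanity check that all four pages were collected (a sketch):

print(len(df))    # expect 200 rows
print(df.tail())  # the last entries should be ranks 196-200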
header = {"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
request = requests.get(site, headers=header)
soup = bs(request.text)
tds = soup.findAll('td', {'class':'info'})
for i in range(len(tds)):
title = tds[i].find('a', {'class':'title ellipsis'}).text.strip()
artist = tds[i].find('a', {'class':'artist ellipsis'}).text.strip()
print('{0:3d}위 {1} - {2}'.format(i+1, title, artist))