크롤링

(4) 픽사베이 이미지 크롤링

빠스무 2023. 7. 9. 16:53
728x90

1. 이미지 수집하기

 

import chromedriver_autoinstaller
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.request import Request, urlopen
 
# 1. Download a single image from a Pixabay search-results page.
# Bug fix: the original snippet called driver.get(url) without ever
# assigning `url`, which raises NameError at runtime.
url = 'https://pixabay.com/ko/images/search/바다/'

driver = webdriver.Chrome()
driver.implicitly_wait(3)  # poll up to 3 s when locating elements

driver.get(url)
time.sleep(3)  # give lazy-loaded thumbnails time to render

# Absolute XPath to one thumbnail — brittle; breaks if Pixabay changes its layout.
image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div[4]/div[3]/div/a/img'
image_url = driver.find_element(By.XPATH, image_xpath).get_attribute('src')
print(image_url)

# A browser-like User-Agent is needed: Pixabay rejects urllib's default agent.
image_request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
# Context manager guarantees the file is closed even if the download raises.
with open('see.jpg', 'wb') as f:
    f.write(urlopen(image_request).read())

2. 여러 개 이미지 수집하기

# 2. Download every thumbnail on one search-results page.
# Bug fix: `url` was used without being defined in the original.
url = 'https://pixabay.com/ko/images/search/바다/'

driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get(url)
time.sleep(3)  # let lazy-loaded thumbnails render

# Container element that holds all result <img> tags.
image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div'
image_area = driver.find_element(By.XPATH, image_area_xpath)
image_elements = image_area.find_elements(By.TAG_NAME, 'img')

image_urls = []
for image_element in image_elements:
    # Lazy-loaded thumbnails keep the real URL in data-lazy-src;
    # already-loaded ones only carry src. (Single attribute fetch each,
    # instead of calling get_attribute('data-lazy-src') twice.)
    image_url = image_element.get_attribute('data-lazy-src')
    if image_url is None:
        image_url = image_element.get_attribute('src')
    print(image_url)
    image_urls.append(image_url)

# enumerate replaces the manual range(len(...)) index loop.
for i, image_url in enumerate(image_urls):
    # Browser-like User-Agent: Pixabay 403s urllib's default agent.
    image_request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    with open(f'sea{i}.jpg', 'wb') as f:  # closes the file even on error
        f.write(urlopen(image_request).read())

3. 함수로 리팩토링

  • crawl_image(keyword, pages)
import os

def crawl_image(keyword, pages):
    """Download all Pixabay thumbnails for *keyword* across *pages* result pages.

    Images are saved as ``<keyword>/<keyword>1.jpg``, ``<keyword>2.jpg``, ...
    with a single counter running across every page.

    Relies on a module-level selenium ``driver`` already being open.

    Args:
        keyword: Search term; also used as the output folder name.
        pages:   Number of result pages to crawl (1-based, inclusive).
    """
    counter = 1  # running file index across all pages, so names never collide
    folder_name = keyword
    os.makedirs(folder_name, exist_ok=True)

    for page in range(1, pages + 1):
        url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'
        driver.get(url)
        time.sleep(3)  # let lazy-loaded thumbnails render

        # Container element holding every result <img> on the page.
        image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div'
        image_area = driver.find_element(By.XPATH, image_area_xpath)
        image_elements = image_area.find_elements(By.TAG_NAME, 'img')

        for image_element in image_elements:
            # Lazy-loaded thumbnails keep the real URL in data-lazy-src;
            # fall back to src for already-loaded ones.
            image_url = image_element.get_attribute('data-lazy-src')
            if image_url is None:
                image_url = image_element.get_attribute('src')
            if not image_url:
                continue  # robustness: skip elements with neither attribute
            print(image_url)

            # Browser-like User-Agent: Pixabay 403s urllib's default agent.
            image_request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
            # Bug fix: the original hard-coded the 'sea' prefix left over from
            # the earlier snippet, even though the folder is keyword-specific.
            file_path = os.path.join(folder_name, f'{keyword}{counter}.jpg')
            with open(file_path, 'wb') as f:
                f.write(urlopen(image_request).read())
            counter += 1
# Module-level driver: crawl_image reads this global handle.
driver = webdriver.Chrome()
driver.implicitly_wait(3)  # wait up to 3 s when locating elements
# Crawl 4 result pages for the keyword "바다" (sea).
crawl_image('바다',4)