ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • (4) 픽사베이 이미지 크롤링
    크롤링 2023. 7. 9. 16:53
    728x90

    1. 이미지 수집하기

     

    import chromedriver_autoinstaller
    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from urllib.request import Request, urlopen
     
    # --- Snippet 1: download a single search-result image from Pixabay ---
    driver = webdriver.Chrome()
    driver.implicitly_wait(3)

    # Fix: the original used `url` without ever defining it (NameError).
    # Search URL inferred from the crawl_image() snippet later in this post
    # — confirm the keyword/page you actually want.
    url = 'https://pixabay.com/ko/images/search/바다/'
    driver.get(url)
    time.sleep(3)  # give lazy-loaded thumbnails time to appear

    # Absolute XPath to one specific thumbnail; brittle — breaks whenever
    # Pixabay changes its page layout.
    image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div[4]/div[3]/div/a/img'
    image_url = driver.find_element(By.XPATH, image_xpath).get_attribute('src')
    print(image_url)

    # Browser-like User-Agent so the image CDN does not reject the request.
    image_byte = Request(image_url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    # `with` guarantees the file handle is closed even if the download raises.
    with open('see.jpg', 'wb') as f:
        f.write(urlopen(image_byte).read())

    2. 여러 개 이미지 수집하기

    # --- Snippet 2: download every thumbnail on one search-result page ---
    driver = webdriver.Chrome()
    driver.implicitly_wait(3)
    # Fix: the original used `url` without ever defining it (NameError).
    url = 'https://pixabay.com/ko/images/search/바다/'
    driver.get(url)
    time.sleep(3)  # wait for lazy-loaded thumbnails to populate

    # Container element that holds all result <img> tags.
    image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div'
    image_area = driver.find_element(By.XPATH, image_area_xpath)
    image_elements = image_area.find_elements(By.TAG_NAME, 'img')

    image_urls = []
    for image_element in image_elements:
        # Lazy-loaded images keep the real URL in `data-lazy-src`;
        # already-loaded ones expose it in `src`. Reuse the first lookup
        # instead of calling get_attribute twice as the original did.
        image_url = image_element.get_attribute('data-lazy-src')
        if image_url is None:
            image_url = image_element.get_attribute('src')
        print(image_url)
        image_urls.append(image_url)

    # enumerate() replaces the range(len(...)) index loop; `with` closes files.
    for i, image_url in enumerate(image_urls):
        image_byte = Request(image_url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
        with open(f'sea{i}.jpg', 'wb') as f:
            f.write(urlopen(image_byte).read())

    3. 함수로 리팩토링

    • crawl_image(keyword, pages)
    import os

    def crawl_image(keyword, pages):
        """Download all search-result thumbnails for `keyword` from Pixabay.

        Args:
            keyword: search term; also used as the output folder name and
                the saved-file prefix.
            pages: number of result pages to crawl (1..pages).

        Side effects: creates ./<keyword>/ and writes <keyword><n>.jpg files,
        numbered from 1 across all pages. Relies on a module-level selenium
        `driver` having been initialised before the call.
        """
        saved = 0  # running file counter across pages (files numbered from 1)
        folder_name = keyword
        os.makedirs(folder_name, exist_ok=True)
        for page in range(1, pages + 1):
            url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'
            driver.get(url)
            time.sleep(3)  # wait for lazy-loaded thumbnails to populate

            # Container element that holds all result <img> tags on the page.
            image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div'
            image_area = driver.find_element(By.XPATH, image_area_xpath)
            image_elements = image_area.find_elements(By.TAG_NAME, 'img')

            image_urls = []
            for image_element in image_elements:
                # Lazy-loaded images keep the real URL in `data-lazy-src`;
                # reuse the first lookup instead of calling get_attribute twice.
                image_url = image_element.get_attribute('data-lazy-src')
                if image_url is None:
                    image_url = image_element.get_attribute('src')
                print(image_url)
                image_urls.append(image_url)

            for image_url in image_urls:
                image_byte = Request(image_url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
                saved += 1
                # Fix: the original hard-coded the 'sea' prefix left over from
                # the previous snippet; name files after the keyword instead.
                file_path = os.path.join(folder_name, f'{keyword}{saved}.jpg')
                with open(file_path, 'wb') as f:
                    f.write(urlopen(image_byte).read())
    # Launch a fresh Chrome session; crawl_image() reads this module-level
    # `driver`, so it must be created before the call.
    driver = webdriver.Chrome()
    driver.implicitly_wait(3)
    # Crawl 4 result pages for the keyword "바다" ("sea").
    crawl_image('바다',4)

    '크롤링' 카테고리의 다른 글

    (5) 지니 크롤링  (0) 2023.07.09
    (3) 인스타그램 크롤링  (0) 2023.07.09
    (2) 셀레니움  (0) 2023.07.09
    (1) 크롤링  (1) 2023.06.18

    댓글

Designed by Tistory.