Image crawling with Python on Chrome browser
2022-07-09
Step-by-step instructions
1. Install selenium and beautiful soup on terminal
pip install bs4
pip install selenium
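Note: this post uses the Selenium 3 style find_element_by_* helpers, which were removed in Selenium 4.3. If pip pulls in a newer Selenium, either pin an older release or use the find_element(By.…) form shown in the note under step 3:
pip install "selenium<4"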
2. Import urllib.request, BeautifulSoup, webdriver, Keys, and time
# google.py
import urllib.request  # fetches and saves images from URLs
from bs4 import BeautifulSoup  # parses the page source
from selenium import webdriver  # drives the Chrome browser
from selenium.webdriver.common.keys import Keys  # keyboard keys such as END
import time  # .sleep() lets you pause between actions while crawling images
3. Instantiate the Chrome browser (Windows 10-based)
If you don't have ChromeDriver yet, download and install it first from the ChromeDriver site.
The examples below use my local path.
# google.py
# instantiate browser
binary = r'C:\Users\HP\Desktop\chromedriver\chromedriver.exe'  # path to ChromeDriver
browser = webdriver.Chrome(binary)  # initialize the browser
# open a new, ready-to-search Google Images tab
browser.get("https://www.google.com/imghp?hl=en&search?hl=en&q=")
# "q" is the name attribute of the search input
elem = browser.find_element_by_name("q")
elem.send_keys("golden retriever")  # the keywords you want to search for
elem.submit()  # submit the search form
The r prefix in front of the ChromeDriver path marks it as a raw string, which avoids backslash-escaping errors:
r'C:\Users\HP\Desktop\chromedriver\chromedriver.exe'
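If you are running Selenium 4 instead, here is a minimal sketch of the equivalent setup (assuming the same driver path; Service and By replace the removed positional-path argument and find_element_by_* helpers):
# google.py (Selenium 4 variant -- a sketch, not the original post's code)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

binary = r'C:\Users\HP\Desktop\chromedriver\chromedriver.exe'
browser = webdriver.Chrome(service=Service(binary))  # the driver path now goes through Service
browser.get("https://www.google.com/imghp?hl=en&search?hl=en&q=")
elem = browser.find_element(By.NAME, "q")  # replaces find_element_by_name("q")
elem.send_keys("golden retriever")
elem.submit()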
4. Load all available images in the browser
# google.py
# scroll to the bottom repeatedly so lazily loaded images appear
for i in range(1, 10):
    # find the body tag and press the END key; runs 9 times (i = 1..9)
    # Keys.END simulates pressing the END key
    browser.find_element_by_xpath("//body").send_keys(Keys.END)
    try:
        # "smb" is the id of the "Show more results" button
        browser.find_element_by_id("smb").click()
        time.sleep(5)  # wait 5 seconds for new results to load
    except:
        time.sleep(5)  # button not present yet; just wait
time.sleep(5)  # final wait so the last batch of images finishes loading
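If the END keystrokes don't trigger lazy loading reliably, scrolling with JavaScript until the page height stops growing is a common alternative. A minimal sketch (my addition, not in the original script):
last_height = browser.execute_script("return document.body.scrollHeight")
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give new thumbnails time to load
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:  # height stopped growing: nothing more to load
        break
    last_height = new_height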
5. Fetch the image URLs and download them
Initialize html and soup
# google.py
html = browser.page_source  # get the page source from the browser
# parse it with BeautifulSoup so the image URLs can be extracted
soup = BeautifulSoup(html, "html.parser")
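Before writing the fetch functions, a quick sanity check (my addition, not in the original post) confirms the rg_i class still matches the thumbnails; Google changes its markup regularly:
print(len(soup.find_all("img", class_="rg_i")))  # should be well above zero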
Create methods for fetching and downloading
# google.py
# method for collecting the image URLs into a list
def fetch_list_url():
    params = []  # list that will hold the image URLs
    # find all img tags with class rg_i (Google Images thumbnails)
    imgList = soup.find_all("img", class_="rg_i")
    # extract the image source (URL) from each tag
    for im in imgList:
        try:
            params.append(im["src"])  # loaded thumbnails carry src
        except KeyError:
            params.append(im["data-src"])  # lazily loaded ones carry data-src instead
    return params
# method for downloading the images from the collected URLs
def fetch_detail_url():
    params = fetch_list_url()  # the list of image URLs
    # print(params)
    a = 1  # auto-incrementing numeric file name
    for p in params:
        # p is one image URL; save it to the download path
        # with the auto-incrementing name and a .jpg extension
        urllib.request.urlretrieve(p, r"C:\Users\HP\Documents\python_project\img/" + str(a) + ".jpg")
        a += 1  # increment the file-name counter

# calling fetch_detail_url() runs fetch_list_url() first inside it
fetch_detail_url()
browser.quit() #close the browser
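For a more robust download step, here is a hardened sketch (my addition; the helper name is hypothetical and it assumes the same folder layout) that creates the target folder if needed and keeps going when a single download fails:
import os

def fetch_detail_url_safe(download_dir=r"C:\Users\HP\Documents\python_project\img"):
    os.makedirs(download_dir, exist_ok=True)  # create the folder if it is missing
    for a, p in enumerate(fetch_list_url(), start=1):
        try:
            urllib.request.urlretrieve(p, os.path.join(download_dir, str(a) + ".jpg"))
        except Exception as e:
            print("skipped image", a, "-", e)  # one bad URL should not stop the crawl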
Full code here