What will be scraped
Full Code
If you don't need an explanation, have a look at the full code example in the online IDE.
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selectolax.lexbor import LexborHTMLParser
import time, json


def start_selenium_stealth(url):
    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument('start-maximized')
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=service, options=options)

    stealth(
        driver,
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        languages=['en-US', 'en'],
        vendor='Google Inc.',
        platform='Win32',
        webgl_vendor='Intel Inc.',
        renderer='Intel Iris OpenGL Engine',
        fix_hairline=True,
    )

    driver.get(url)
    time.sleep(2)

    return driver


def scrape_google_things_to_do(driver):
    data = []

    parser = LexborHTMLParser(driver.page_source)
    data.append({
        'category': 'Top sights',
        'results': get_category_data(parser)
    })

    buttons = driver.find_elements(By.CSS_SELECTOR, '.iydyUc button')
    categories = parser.root.css('.m1GHmf')

    for button, category in zip(buttons, categories):
        driver.execute_script('arguments[0].click();', button)
        time.sleep(2)

        parser = LexborHTMLParser(driver.page_source)
        data.append({
            'category': category.text(),
            'results': get_category_data(parser)
        })

    driver.quit()
    return data


def get_category_data(parser):
    category_data = []

    for result in parser.root.css('.T1Yjbc'):
        title = result.css_first('.YmWhbc').text()
        description = result.css_first('.nFoFM').text()

        rating = result.css_first('.lA0BZ')
        rating = float(rating.text()) if rating else None

        reviews = result.css_first('.XLC8M')
        reviews = int(reviews.text()[2:-1].replace(',', '')) if reviews else None

        thumbnail = result.css_first('.R1Ybne').attributes
        thumbnail = thumbnail.get('src') if thumbnail.get('src') else thumbnail.get('data-src')

        category_data.append({
            'title': title,
            'description': description,
            'rating': rating,
            'reviews': reviews,
            'thumbnail': thumbnail
        })

    return category_data


def main():
    URL = 'https://www.google.com/travel/things-to-do/see-all?dest_mid=%2Fm%2F06c62&dest_state_type=sattd&q=Rome'

    driver = start_selenium_stealth(URL)
    google_things_to_do = scrape_google_things_to_do(driver)

    print(json.dumps(google_things_to_do, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
Preparation
Install libraries:
pip install selenium selenium-stealth webdriver-manager selectolax
Code Explanation
Import libraries:
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selectolax.lexbor import LexborHTMLParser
import time, json
Library | Purpose
---|---
webdriver | to drive a browser natively, as a user would, either locally or on a remote machine using the Selenium server.
selenium-stealth | to prevent almost all forms of bot detection (CAPTCHAs and the like).
Service | to manage the starting and stopping of ChromeDriver.
By | a set of supported locator strategies (By.ID, By.TAG_NAME, By.XPATH, etc.).
LexborHTMLParser | a fast HTML5 parser with CSS selectors, using the Lexbor engine.
time | to work with time in Python.
json | to convert extracted data to a JSON object.
The next part of the code is divided into functions. Each function is described in the corresponding heading below.
Top-level code environment
The URL variable contains a link to the Google "Things to do" page:
URL = 'https://www.google.com/travel/things-to-do/see-all?dest_mid=%2Fm%2F06c62&dest_state_type=sattd&q=Rome'
📌Note: This link leads to the "What to see" tab, and this blog post focuses only on this tab.
There are 3 different tabs on the Google "Things to do" page:
Each tab has its own link:
# Highlights
https://www.google.com/travel/things-to-do?dest_mid=%2Fm%2F06c62&q=Rome&dest_state_type=main
# What to see
https://www.google.com/travel/things-to-do/see-all?dest_mid=%2Fm%2F06c62&q=Rome&dest_state_type=sattd
# What to do
https://www.google.com/travel/things-to-do/experiences?dest_mid=%2Fm%2F06c62&q=Rome&dest_state_type=sae
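All three links share the same query parameters. For illustration, such a link can be assembled with Python's standard urllib.parse; the build_things_to_do_url helper below is hypothetical and not part of the scraper:

from urllib.parse import urlencode

def build_things_to_do_url(dest_mid, query, path='see-all', state='sattd'):
    # dest_mid is the destination's knowledge-graph ID, e.g. /m/06c62 for Rome
    params = {'dest_mid': dest_mid, 'q': query, 'dest_state_type': state}
    return f'https://www.google.com/travel/things-to-do/{path}?{urlencode(params)}'

print(build_things_to_do_url('/m/06c62', 'Rome'))
# https://www.google.com/travel/things-to-do/see-all?dest_mid=%2Fm%2F06c62&q=Rome&dest_state_type=sattd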
Next, the URL is passed to the start_selenium_stealth(URL) function, which starts selenium in stealth mode. The driver returned by this function is passed to the scrape_google_things_to_do(driver) function to extract all the data. These functions are explained under the corresponding headings below.
driver = start_selenium_stealth(URL)
google_things_to_do = scrape_google_things_to_do(driver)
After all the data is retrieved, it is output in JSON format:
print(json.dumps(google_things_to_do, indent=2, ensure_ascii=False))
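ensure_ascii=False keeps non-ASCII characters readable in the output, which matters for place names such as "Cinecittà World" in the results below. A quick illustration:

import json

print(json.dumps({'title': 'Cinecittà World'}))
# {"title": "Cinecitt\u00e0 World"}
print(json.dumps({'title': 'Cinecittà World'}, ensure_ascii=False))
# {"title": "Cinecittà World"}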
This code uses the boilerplate __name__ == "__main__" construct, which protects users from accidentally invoking the script when they didn't intend to. It indicates that the code is a runnable script:
def main():
    URL = 'https://www.google.com/travel/things-to-do/see-all?dest_mid=%2Fm%2F06c62&dest_state_type=sattd&q=Rome'

    driver = start_selenium_stealth(URL)
    google_things_to_do = scrape_google_things_to_do(driver)

    print(json.dumps(google_things_to_do, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
This check passes only when the file is run directly; if the file is imported into another module, main() will not be called. You can watch the video Python Tutorial: if __name__ == '__main__' for more details.
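A minimal sketch of this behavior, with hypothetical file names:

# script.py
def main():
    print('running main()')

if __name__ == '__main__':
    main()  # runs only via `python script.py`

# other.py (a hypothetical second module)
import script  # prints nothing: here script.__name__ == 'script', not '__main__'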
Start selenium stealth
The function takes the URL and returns a selenium driver.
The selenium library is used to simulate user actions in the browser. For selenium to work, you need ChromeDriver, which can be downloaded manually or via code. In our case, the second method is used: ChromeDriverManager downloads the matching ChromeDriver binary under the hood, and Service manages starting and stopping it:
service = Service(ChromeDriverManager().install())
You should also add options for the driver to work correctly:
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
Now we can start webdriver:
driver = webdriver.Chrome(service=service, options=options)
After starting the webdriver, you need to pass it and the other attributes to the stealth() function. This makes selenium harder to detect, which helps bypass bot checks such as CAPTCHAs (including Cloudflare's):
stealth(
    driver,
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    languages=['en-US', 'en'],
    vendor='Google Inc.',
    platform='Win32',
    webgl_vendor='Intel Inc.',
    renderer='Intel Iris OpenGL Engine',
    fix_hairline=True,
)
Pass the URL to the get()
method and wait a few seconds for the site to load:
driver.get(url)
time.sleep(2)
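By the way, a fixed time.sleep() either waits longer than necessary or not long enough. A more robust alternative is an explicit wait; a minimal sketch, assuming we wait for the .T1Yjbc result cards used later in this post:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# Block for up to 10 seconds until at least one result card is present
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.T1Yjbc'))
)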
At the end of the function, the driver
is returned:
return driver
The function looks like this:
def start_selenium_stealth(url):
    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument('start-maximized')
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=service, options=options)

    stealth(
        driver,
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        languages=['en-US', 'en'],
        vendor='Google Inc.',
        platform='Win32',
        webgl_vendor='Intel Inc.',
        renderer='Intel Iris OpenGL Engine',
        fix_hairline=True,
    )

    driver.get(url)
    time.sleep(2)

    return driver
Scrape Google "Things to do" page
The function takes a selenium driver and returns the extracted data.
The data list, to which the extracted data will be added, is declared:
data = []
Passing HTML content to Lexbor:
parser = LexborHTMLParser(driver.page_source)
At the moment, the page displays the top sights. A dictionary with the data received for the "Top sights" category is added to the data list. The get_category_data(parser) function returns a list of the extracted places:
data.append({
    'category': 'Top sights',
    'results': get_category_data(parser)
})
After that, we get the lists of buttons and categories:
buttons = driver.find_elements(By.CSS_SELECTOR, '.iydyUc button')
categories = parser.root.css('.m1GHmf')
Then we iterate over these lists in a loop. In each iteration, the following algorithm is executed:

- Click on the button that is responsible for the specific category. The click is done via JavaScript (execute_script), which also works when a regular click() would be intercepted by an overlapping element.
- Wait a few seconds for the page to load.
- Pass the HTML content to Lexbor.
- Add a dictionary with the data extracted from this category to the data list.
for button, category in zip(buttons, categories):
    driver.execute_script('arguments[0].click();', button)
    time.sleep(2)

    parser = LexborHTMLParser(driver.page_source)
    data.append({
        'category': category.text(),
        'results': get_category_data(parser)
    })
After all the operations are done, stop the driver and return the data
list:
driver.quit()
return data
The function looks like this:
def scrape_google_things_to_do(driver):
    data = []

    parser = LexborHTMLParser(driver.page_source)
    data.append({
        'category': 'Top sights',
        'results': get_category_data(parser)
    })

    buttons = driver.find_elements(By.CSS_SELECTOR, '.iydyUc button')
    categories = parser.root.css('.m1GHmf')

    for button, category in zip(buttons, categories):
        driver.execute_script('arguments[0].click();', button)
        time.sleep(2)

        parser = LexborHTMLParser(driver.page_source)
        data.append({
            'category': category.text(),
            'results': get_category_data(parser)
        })

    driver.quit()
    return data
Get category data
The function takes a parser and returns the extracted data by category.
The category_data list, to which the extracted data will be added, is declared:
category_data = []
The data extraction itself is done with selectolax because its Lexbor parser is incredibly fast: around 186% faster than bs4 with the lxml backend when parsing data with 3,000 iterations repeated 5 times. In terms of syntax, it is very similar to both bs4 and parsel, making it easy to use. Please note that selectolax does not currently support XPath.
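For a feel of the syntax, here is a rough side-by-side sketch on a toy HTML snippet (my own example, assuming bs4 and lxml are also installed; they are not required for this scraper):

from selectolax.lexbor import LexborHTMLParser
from bs4 import BeautifulSoup

html = '<div class="place"><span class="name">Colosseum</span></div>'

# selectolax: css()/css_first() select nodes, text() extracts content
parser = LexborHTMLParser(html)
print(parser.root.css_first('.place .name').text())  # Colosseum

# The bs4 equivalent uses select_one() and get_text()
soup = BeautifulSoup(html, 'lxml')
print(soup.select_one('.place .name').get_text())    # Colosseum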
In order to find all the places, you need to use the css()
method and pass the .T1Yjbc
selector there:
for result in parser.root.css('.T1Yjbc'):
    # data extraction will be here
Next, for each place, you need to extract all the data about it. Data such as title and description are easily retrieved: the css_first() method finds the desired element, and since we want its text content, the text() method is used:
title = result.css_first('.YmWhbc').text()
description = result.css_first('.nFoFM').text()
Some places lack data such as rating and reviews, so a ternary expression is used for such cases:
rating = result.css_first('.lA0BZ')
rating = float(rating.text()) if rating else None
When retrieving reviews, the user gets an element like this:
<span class="jdzyld XLC8M " aria-hidden="true"> (329,696)</span>
In order to extract a numeric value, you need to do the following operations: slice the string to remove the leading space and the brackets, then strip the commas.
reviews = result.css_first('.XLC8M')
reviews = int(reviews.text()[2:-1].replace(',', '')) if reviews else None
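Slicing by fixed indices breaks if Google changes the surrounding characters. As a more defensive alternative (my own sketch; the parse_reviews helper is hypothetical, not part of the original code), the digits can be pulled out with a regular expression:

import re

def parse_reviews(text):
    # Find the first run of digits and commas in e.g. ' (329,696)'
    match = re.search(r'[\d,]+', text)
    return int(match.group().replace(',', '')) if match else None

print(parse_reviews(' (329,696)'))  # 329696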
When extracting thumbnail, I noticed that the required value is stored in different attributes. Therefore, we first check whether the src attribute exists. If it does, we retrieve its value; otherwise, we extract the value from the data-src attribute:
thumbnail = result.css_first('.R1Ybne').attributes
thumbnail = thumbnail.get('src') if thumbnail.get('src') else thumbnail.get('data-src')
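As a side note, the same fallback can be written more compactly with Python's or operator, which returns its first truthy operand:

thumbnail = thumbnail.get('src') or thumbnail.get('data-src')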
Add a dictionary with all the extracted data to the category_data list:
category_data.append({
    'title': title,
    'description': description,
    'rating': rating,
    'reviews': reviews,
    'thumbnail': thumbnail
})
At the end of the function, the category_data
list is returned:
return category_data
The function looks like this:
def get_category_data(parser):
    category_data = []

    for result in parser.root.css('.T1Yjbc'):
        title = result.css_first('.YmWhbc').text()
        description = result.css_first('.nFoFM').text()

        rating = result.css_first('.lA0BZ')
        rating = float(rating.text()) if rating else None

        reviews = result.css_first('.XLC8M')
        reviews = int(reviews.text()[2:-1].replace(',', '')) if reviews else None

        thumbnail = result.css_first('.R1Ybne').attributes
        thumbnail = thumbnail.get('src') if thumbnail.get('src') else thumbnail.get('data-src')

        category_data.append({
            'title': title,
            'description': description,
            'rating': rating,
            'reviews': reviews,
            'thumbnail': thumbnail
        })

    return category_data
Output
[
  {
    "category": "Top sights",
    "results": [
      {
        "title": "Colosseum",
        "description": "Iconic ancient Roman gladiatorial arena",
        "rating": 4.7,
        "reviews": 329514,
        "thumbnail": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcSqsodn85JD0w-M7U-BpFSwvCcj_XGXejYVMoWU8KyL6KPogCN5Mw_NRY9v3kYN5L3-r1DCz_VdR1YysZta5nO5OQ"
      },
      ... other results
    ]
  },
  {
    "category": "Local favorites",
    "results": [
      {
        "title": "Centrale Montemartini",
        "description": "Sculpture museum in a former power plant",
        "rating": 4.7,
        "reviews": 3622,
        "thumbnail": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTQyIoBqIledLbNakjEisXXQxB6jnUK1AHhyplqqwGDuN7UkG-U_gs6Bvb0K9pu4jkb3F1eb_46XkEGpKnKp-R99Q"
      },
      ... other results
    ]
  },
  {
    "category": "Kid-friendly",
    "results": [
      {
        "title": "Explora - Il Museo dei Bambini di Roma",
        "description": "Interactive museum for kids' learning",
        "rating": 4.4,
        "reviews": 3750,
        "thumbnail": "https://lh5.googleusercontent.com/p/AF1QipODf1pdsNaAGmRDnDzr9oL6JVML7AzxM6Ac7Nfb=w464-h260-n-k-no"
      },
      ... other results
    ]
  },
  {
    "category": "Art & Culture",
    "results": [
      {
        "title": "Trevi Fountain",
        "description": "Iconic 18th-century sculpted fountain",
        "rating": 4.8,
        "reviews": 344287,
        "thumbnail": "https://encrypted-tbn3.gstatic.com/licensed-image?q=tbn:ANd9GcQ3oYnBk3YBSnYMN5lm9LCsAYjKCu8aTUmq9IcagmcIMWUIbcp1P05qBe2ZCPIqJiGIZMrLhDC6vJqDfigCU-RYVQ"
      },
      ... other results
    ]
  },
  {
    "category": "History",
    "results": [
      {
        "title": "St. Peter's Basilica",
        "description": "World’s largest basilica of Christianity",
        "rating": 4.8,
        "reviews": 133218,
        "thumbnail": "https://encrypted-tbn2.gstatic.com/licensed-image?q=tbn:ANd9GcTN7ZkQkBKeCk_3pPWGkP89Nh1HZRsUS1VABgURN4xCbafTl3ykf3hsHzSIgeMfWV2GPjWPBB9USYfBqlIjGrCwzQ"
      },
      ... other results
    ]
  },
  {
    "category": "Outdoors",
    "results": [
      {
        "title": "Colosseum",
        "description": "Iconic ancient Roman gladiatorial arena",
        "rating": 4.7,
        "reviews": 329514,
        "thumbnail": "https://encrypted-tbn2.gstatic.com/licensed-image?q=tbn:ANd9GcT6QDJmOIFe5RKBYZeXMaqSbeby5JGoyKYgCQx_QYR_d5u_9jew-QEyMkHJm-RlphGMyeGNWSrK0mkGiRhapCdgLg"
      },
      ... other results
    ]
  },
  {
    "category": "Museums",
    "results": [
      {
        "title": "Sistine Chapel",
        "description": "Michelangelo's iconic painted ceiling",
        "rating": 4.7,
        "reviews": 57439,
        "thumbnail": "https://encrypted-tbn0.gstatic.com/licensed-image?q=tbn:ANd9GcSS5FraPHJIrRHAjLscN_heiHqyxeYk-SSfdHxBEHWSJccjR_k-3_8yQQvfw9jJ26M3OPuVFlGWi0nxmG1et__Wuw"
      },
      ... other results
    ]
  },
  {
    "category": "Amusement Parks",
    "results": [
      {
        "title": "Cinecittà World",
        "description": "Amusement park with rides & shows",
        "rating": 3.9,
        "reviews": 11920,
        "thumbnail": "https://encrypted-tbn0.gstatic.com/licensed-image?q=tbn:ANd9GcTValH23IxS-mwl6aDBgi1zC_z-o0Oy9xnJlq6KsmiJSlYCLUcrfqoyuWNXAEdan00ykkVO90Ef8wRYX8SMlbtwnQ"
      },
      ... other results
    ]
  },
  {
    "category": "Markets",
    "results": [
      {
        "title": "Porta Portese",
        "description": "Large flea market, open Sundays",
        "rating": 4.2,
        "reviews": 5381,
        "thumbnail": "https://encrypted-tbn1.gstatic.com/licensed-image?q=tbn:ANd9GcRDjn4p2s2qFdoAtywEis1F_05IvFZd7lCDVxU3ICCcGIAvyCv1mdetSjr6izoid1DjbbEgQlRxf2S6utMqqwNe_Q"
      },
      ... other results
    ]
  }
]