/**
 * Evaluate an XPath expression against the current document and collect the
 * matches into an array.
 *
 * @param {string} s - XPath expression to evaluate.
 * @param {Function} [each] - Mapper applied to every matched node; defaults to
 *   the identity function, so the nodes themselves are returned.
 * @returns {Array} One entry per matched node, in the order the iterator
 *   yields them.
 */
let xeval = (s, each = e => e) => {
  const iter = document.evaluate(s, document, null, XPathResult.ANY_TYPE, null)
  const elems = []
  // The cursor is declared locally: a bare `elem = ...` assignment creates an
  // implicit global and throws a ReferenceError in strict mode / ES modules.
  // NOTE(review): per the DOM spec the iterator is invalidated if the document
  // is mutated mid-iteration, so `each` should only read from the node.
  for (let elem = iter.iterateNext(); elem; elem = iter.iterateNext()) {
    elems.push(each(elem))
  }
  return elems
}
I was recently helping someone scrape app reviews and came across this tip. This will work to scrape any list / feed of elements.
The Problem
A lot of sites generate class names on page load, so you can't always use class names or ids to pull page data. The solution is to use XPath. I was surprised by how easy this is!
Scroll down the page a bit and choose an element you want to scrape
Inspect the element and copy its XPath
right click element in dev window -> copy -> XPath / full XPath
it will look like:
'.../div/div[10]/div/div[2]/div[1]/div[1]/span'
by deleting '[10]', all similar elements will be selected
use the XPath to select the page elements
And that's it!
Here's the code
// GRAB USER REVIEWS FROM THE GOOGLE PLAY STORE (works as of 5/2020)
// first load the page so all reviews are showing (hold space bar until it stops loading reviews)
// then open up developer tools ctrl-l and type the following AFTER UNDERSTANDING WHAT IT DOES
// never paste code you don't understand into the developer tool
// Google generates their css classes on page load
// use XPath to extract html based on content / position
// this won't change based on page load
/**
 * Evaluate an XPath expression against the current document and collect the
 * matches into an array.
 *
 * @param {string} s - XPath expression to evaluate.
 * @param {Function} [each] - Mapper applied to every matched node; defaults to
 *   the identity function, so the nodes themselves are returned.
 * @returns {Array} One entry per matched node, in the order the iterator
 *   yields them.
 */
let xeval = (s, each = e => e) => {
  const iter = document.evaluate(s, document, null, XPathResult.ANY_TYPE, null)
  const elems = []
  // The cursor is declared locally: a bare `elem = ...` assignment creates an
  // implicit global and throws a ReferenceError in strict mode / ES modules.
  // NOTE(review): per the DOM spec the iterator is invalidated if the document
  // is mutated mid-iteration, so `each` should only read from the node.
  for (let elem = iter.iterateNext(); elem; elem = iter.iterateNext()) {
    elems.push(each(elem))
  }
  return elems
}
// grab content on the page
// copy the XPath for each element and modify it to select all similar elements
// inspect the element in the developer console and choose copy -> full XPath
// note: the full XPath doesn't depend on class names or ids
/*
to get all reviews, choose the XPath for a single review that is some way down the page
it will look like:
.../div/div[214]/div/div[2]/div[1]/div[1]/span
by deleting '[214]', all similar elements will be selected
*/
// Scrape every review currently rendered on the page and copy the result to
// the clipboard. Relies on xeval() being defined and on the DevTools console
// utility copy(), so it is meant to be run from the browser console.

// click all 'read more' buttons to load full reviews
// grab all buttons
let buttons_xpath = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div/div/div[2]/div[2]/span[1]/div/button'
xeval(buttons_xpath).forEach(b => b.click())

let name_xpath = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div/div/div[2]/div[1]/div[1]/span'
let names = xeval(name_xpath, e => e.innerText)

// an empty "helpful" counter is recorded as "0"
let helpful_score_xpath = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div/div/div[2]/div[1]/div[2]/div/div[1]/div[2]'
let helpful_scores = xeval(helpful_score_xpath, e => e.innerText === "" ? "0" : e.innerText)

let date_xpath = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div/div/div[2]/div[1]/div[1]/div/span[2]'
let dates = xeval(date_xpath, e => e.innerText)

let review_xpath = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div/div/div[2]/div[2]/span[1]'
let reviews = xeval(review_xpath, e => e.innerText)

// star ratings are stored in text in the 'aria-label' attribute
// copy the x path of all star rating elements and append '@aria-label' to select each
let star_rating_xpath = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div/div/div[2]/div[1]/div[1]/div/span[1]/div/div/@aria-label'
// grab the value of each attribute and use regex to extract the rating number
// a label without a digit yields "?" instead of throwing on a null match,
// so one odd label does not abort the whole scrape
let star_ratings = xeval(star_rating_xpath).map(attr => {
  const match = /[0-5]/.exec(attr.value)
  return match ? match[0] : "?"
})

// check all arrays have the same size (sanity check)
// a mismatch means at least one XPath selected a different number of nodes,
// so rows below would be misaligned
if (![names, helpful_scores, dates, reviews, star_ratings].every(array => array.length === names.length)) {
  alert('something went wrong')
}

// store all reviews in a string separated by ";;"
let all_reviews = names
  .map((name, i) => `${name}, ${dates[i]}, ${star_ratings[i]} stars, ${helpful_scores[i]} people found this helpful, ${reviews[i]} ;;\n`)
  .join("")

// copy all_reviews to the clipboard (copy() is a DevTools console utility)
copy(all_reviews)
Top comments (0)