#!/usr/bin/env bash
#
# Scrape every page listed in a site's sitemap.xml, render each page to
# plain text, and concatenate everything into one wrapped text file.
# Requires: curl, lynx (HTML -> text), GNU grep (the -P flag used below).
set -uo pipefail

# Site to scrape and where results go. readonly guards accidental reassignment.
readonly DOMAIN="https://lightspeed-user-docs.lightspeedhq.com"
readonly SITEMAP_URL="${DOMAIN}/sitemap.xml"
readonly OUTPUT_FILE="all_docs.txt"

echo "Starting extraction from ${SITEMAP_URL}"

# Truncate the output file. ':' writes nothing at all, unlike the previous
# `echo "" >` which left a stray blank line at the top of the file.
: > "$OUTPUT_FILE"
# --- Build the URL list from the sitemap -------------------------------------
# Download the sitemap first so a network/HTTP failure is reported distinctly
# from a sitemap that simply contains no usable URLs.
echo "Fetching and parsing sitemap..."
if ! sitemap_xml=$(curl -s --fail --max-time 15 "$SITEMAP_URL"); then
  echo "Error: failed to download sitemap from ${SITEMAP_URL} (network or HTTP error)." >&2
  exit 1
fi

# Extract all <loc> URLs (NOTE: grep -P needs GNU grep), keep only http(s)
# links, drop obvious non-HTML assets, and de-duplicate. If every line is
# filtered out, grep exits non-zero and urls.txt is left empty; the -s check
# below catches that case.
printf '%s\n' "$sitemap_xml" \
  | grep -oP '(?<=<loc>).*?(?=</loc>)' \
  | grep -E '^https?://' \
  | grep -vE '\.(jpg|jpeg|png|gif|pdf|xml|txt|zip|gz|css|js)$' \
  | sort -u > urls.txt

if [ ! -s urls.txt ]; then
  echo "Error: No valid URLs found in sitemap. Check sitemap URL or connectivity." >&2
  exit 1
fi
count=$(wc -l < urls.txt)
echo "Found ${count} candidate pages. Starting extraction..."
# --- Fetch each page and append its plain-text rendering ---------------------
i=0
while IFS= read -r url; do
  i=$((i + 1))   # arithmetic assignment; avoids ((i++)) returning non-zero at 0
  echo "[$i/$count] Processing: ${url}"
  {
    printf '\n\n%s\n' "================================================================"
    printf "SOURCE: %s\n" "$url"
    printf '%s\n\n' "================================================================"
    # Fetch -> render HTML to text -> drop blank lines -> wrap at 60 chars
    # -> trim trailing spaces. `--fail` keeps HTTP error bodies (404/500
    # pages) out of the archive; with `set -o pipefail` a curl failure makes
    # the whole pipeline fail, which we report on stderr and keep going.
    if ! curl -s --fail --max-time 20 --retry 2 --location "$url" 2>/dev/null |
      lynx -stdin -dump -nolist -nomargins -width=9999 2>/dev/null |
      sed '/^[[:space:]]*$/d' |
      fold -s -w 60 |
      sed 's/[[:space:]]*$//'; then
      printf 'WARNING: failed to fetch or render %s\n' "$url" >&2
    fi
    printf '\n\n'
  } >> "$OUTPUT_FILE"
  # Polite delay - raise this if the server starts answering 429/503.
  sleep 1.8
done < urls.txt
# --- Summary -----------------------------------------------------------------
printf '\nDone. All extracted content saved to: %s\n' "$OUTPUT_FILE"
printf 'Total pages processed: %s\n' "$i"
wc -l "$OUTPUT_FILE" # rough line count
# NOTE: the two lines below are web-page footer text accidentally captured
# when this script was copied from a blog post; commented out so the file
# remains valid shell.
# For further actions, you may consider blocking this person and/or reporting abuse
# Top comments (0)