#!/usr/bin/env bash
# Fetch every page listed in a local sitemap.xml, render each to plain text,
# and concatenate the results into one output file.
#
# Requirements: curl, lynx. Place sitemap.xml in the current directory.
set -uo pipefail
# ────────────────────────────────────────────────
# CONFIGURATION
# ────────────────────────────────────────────────
SITEMAP_FILE="sitemap.xml" # ← change if your file has different name/path
OUTPUT_FILE="all_docs.txt"
DOMAIN="https://lightspeed-user-docs.lightspeedhq.com"
# Polite delay between requests (seconds) – increase if you get blocked
REQUEST_DELAY=2.0
# ────────────────────────────────────────────────
# MAIN LOGIC
# ────────────────────────────────────────────────
# Fail fast if a required external tool is missing (lynx in particular is
# rarely installed by default), instead of silently producing empty output.
for tool in curl lynx; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required tool '${tool}' is not installed." >&2
    exit 1
  fi
done
echo "Reading local sitemap: ${SITEMAP_FILE}"
if [ ! -f "$SITEMAP_FILE" ]; then
  # Diagnostics belong on stderr, not stdout.
  echo "Error: File '${SITEMAP_FILE}' not found in current directory." >&2
  echo "Please place sitemap.xml in the same folder as this script." >&2
  exit 1
fi
# Extract all <loc> URLs, filter out non-html resources, sort unique.
# NOTE: `grep -oP` (PCRE) is a GNU extension absent from macOS/BSD grep,
# so use portable sed instead. Assumes one <loc>…</loc> per line, which
# is how sitemap generators emit them — TODO confirm for exotic sitemaps.
sed -n 's:.*<loc>\(.*\)</loc>.*:\1:p' "$SITEMAP_FILE" \
  | grep -E '^https?://' \
  | grep -vE '\.(jpg|jpeg|png|gif|webp|svg|pdf|xml|txt|zip|gz|css|js|json)$' \
  | grep -F "${DOMAIN}" \
  | sort -u > urls.txt
if [ ! -s urls.txt ]; then
  # All diagnostics to stderr.
  echo "Error: No valid page URLs found in ${SITEMAP_FILE}." >&2
  echo "Possible reasons:" >&2
  echo " - It's a sitemap index (contains other .xml files instead of pages)" >&2
  echo " - No <loc> tags with http/https URLs" >&2
  echo " - All URLs are images/PDFs/etc." >&2
  echo "Check the file manually or share first 20 lines." >&2
  exit 1
fi
count=$(wc -l < urls.txt)
# BSD/macOS wc pads its output with spaces; arithmetic expansion strips it
# so the "[i/count]" progress display stays clean.
count=$((count))
echo "Found ${count} candidate documentation pages."
echo "Output will be written to: ${OUTPUT_FILE}"
echo ""
# Clear / start fresh output file (`:` makes the truncation explicit).
: > "$OUTPUT_FILE"
i=0
while IFS= read -r url; do
  # i=$((i+1)) instead of ((i++)): the latter returns status 1 when the
  # pre-increment value is 0, which would abort the loop under `set -e`.
  i=$((i + 1))
  echo "[$i/${count}] ${url}"
  {
    printf '\n\n%s\n' "════════════════════════════════════════════════════════════════"
    printf "SOURCE: %s\n" "$url"
    printf '%s\n\n' "════════════════════════════════════════════════════════════════"
    # Fetch → convert to readable text → wrap lines at 60 characters.
    # Failures for a single page are deliberately tolerated (best-effort
    # scrape); the loop simply moves on to the next URL.
    curl -s --max-time 20 --retry 2 --location \
      -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" \
      "$url" 2>/dev/null \
      | lynx -stdin -dump -nolist -nomargins -width=9999 2>/dev/null \
      | sed '/^[[:space:]]*$/d' \
      | fold -s -w 60 \
      | sed 's/[[:space:]]*$//'
    printf '\n\n'
  } >> "$OUTPUT_FILE"
  # Polite throttle between requests.
  sleep "${REQUEST_DELAY}"
done < urls.txt
# Final summary for the operator: page count, destination, and size on disk.
printf '\nFinished.\n'
printf 'Total pages processed: %s\n' "${i}"
printf 'Output file: %s\n' "${OUTPUT_FILE}"
printf 'Size: %s\n' "$(du -h "$OUTPUT_FILE" | cut -f1)"
# (Stray web-page text accidentally captured with the script — commented out
# so the file remains executable.)
# For further actions, you may consider blocking this person and/or reporting abuse
# Top comments (0)