#!/usr/bin/env bash
# Fetch every page listed in a local sitemap.xml and append the rendered
# plain text of each page to a single output file (all_docs.txt).
#
# NOTE: -e is deliberately omitted so one failed fetch does not abort the
# whole crawl; -u and pipefail still catch unset vars and broken pipelines.
set -uo pipefail

readonly SITEMAP_FILE="sitemap.xml"   # local sitemap to read URLs from
readonly OUTPUT_FILE="all_docs.txt"   # aggregate plain-text dump
readonly DOMAIN="https://lightspeed-user-docs.lightspeedhq.com"  # prefix for relative <loc> paths
readonly REQUEST_DELAY=2.0            # seconds to sleep between requests (politeness)
echo "Reading local sitemap: ${SITEMAP_FILE}"
if [ ! -f "$SITEMAP_FILE" ]; then
  # Diagnostics belong on stderr so they never pollute captured stdout.
  echo "Error: File '${SITEMAP_FILE}' not found." >&2
  exit 1
fi
# ── URL EXTRACTION ─────────────────────────────────────────────
# Pull every <loc>…</loc> value out of the sitemap, prefix relative paths
# ("/foo") with $DOMAIN, strip trailing slashes, keep only http(s) URLs,
# drop asset/file extensions, and write a sorted de-duplicated list to
# urls.txt.
#
# NOTE(review): grep -oP requires GNU grep built with PCRE (not stock
# macOS/BSD grep). It also matches only when <loc> and </loc> sit on the
# same line — a pretty-printed sitemap with the URL on its own line
# between the tags would yield nothing; confirm against the real file.
grep -oP '(?<=<loc>).*?(?=</loc>)' "$SITEMAP_FILE" \
| sed 's|^/|'"${DOMAIN}"'/|; s|/$||' \
| grep -E '^https?://' \
| grep -vE '\.(jpg|jpeg|png|gif|webp|svg|pdf|xml|txt|zip|gz|css|js|json)$' \
| sort -u > urls.txt
# ────────────────────────────────────────────────────────────────
# Abort early when extraction produced no URLs at all.
if [ ! -s urls.txt ]; then
  # Failure-path diagnostics go to stderr, including the sitemap preview.
  echo "Still no valid URLs found." >&2
  echo "Please check if sitemap.xml really contains <loc> tags." >&2
  echo "First 15 lines of sitemap.xml:" >&2
  head -n 15 "$SITEMAP_FILE" >&2
  exit 1
fi

count=$(wc -l < urls.txt)
echo "Found ${count} pages."
echo ""

# Fail fast if the tools the fetch loop depends on are missing; otherwise
# every page would silently produce an empty dump.
for tool in curl lynx; do
  command -v "$tool" >/dev/null 2>&1 || {
    echo "Error: required tool '${tool}' is not installed." >&2
    exit 1
  }
done

> "$OUTPUT_FILE"   # truncate/create the aggregate output file
i=0
while IFS= read -r url; do
  # $((…)) instead of ((i++)): the latter returns status 1 when i is 0
  # and would abort the script if set -e is ever enabled.
  i=$((i + 1))
  echo "[$i/${count}] ${url}"
  {
    printf '\n\n%s\n' "════════════════════════════════════════════════════════════════"
    printf "SOURCE: %s\n" "$url"
    printf '%s\n\n' "════════════════════════════════════════════════════════════════"
    # --fail: treat HTTP 4xx/5xx as an error so error pages are skipped
    # instead of being dumped into the output. stderr is suppressed
    # intentionally — a failed page just yields an empty section.
    curl -s --fail --max-time 20 --retry 2 --location \
      -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" \
      "$url" 2>/dev/null \
      | lynx -stdin -dump -nolist -nomargins -width=9999 2>/dev/null \
      | sed '/^[[:space:]]*$/d' \
      | fold -s -w 60 \
      | sed 's/[[:space:]]*$//'
    printf '\n\n'
  } >> "$OUTPUT_FILE"
  sleep "${REQUEST_DELAY}"
done < urls.txt

echo "Finished. Processed ${i} pages."
echo "Output: ${OUTPUT_FILE} ($(du -h "$OUTPUT_FILE" | cut -f1))"
# NOTE(review): removed stray web-page text ("For further actions, you may
# consider blocking this person…", "Top comments (0)") that was accidentally
# pasted below the script — it is not shell code and would fail if executed.