store26

#!/usr/bin/env bash
#
# Build one plain-text dump of every page listed in a local sitemap.
#
# Usage: <script> [SITEMAP_FILE] [OUTPUT_FILE]
#   SITEMAP_FILE  sitemap to read          (default: sitemap.xml)
#   OUTPUT_FILE   text file to (re)create  (default: all_docs.txt)
#
# Requires: curl, lynx, and a grep that supports -P (PCRE).

# nounset + pipefail only. errexit is not enabled: later steps inspect
# outcomes themselves (e.g. an empty URL list is reported with a hint
# instead of aborting on grep's "no match" status).
set -uo pipefail

# Both paths can be overridden positionally; with no arguments the script
# behaves exactly as before.
SITEMAP_FILE="${1:-sitemap.xml}"
OUTPUT_FILE="${2:-all_docs.txt}"
# Prefix applied to sitemap entries that start with "/".
DOMAIN="https://lightspeed-user-docs.lightspeedhq.com"
# Argument handed to `sleep` between page requests.
REQUEST_DELAY=2.0
readonly SITEMAP_FILE OUTPUT_FILE DOMAIN REQUEST_DELAY

# Fail fast when a required tool is missing instead of discovering it
# page by page while fetching.
missing=0
for tool in curl lynx; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required command '${tool}' not found in PATH." >&2
    missing=1
  fi
done
if ((missing)); then
  exit 1
fi

echo "Reading local sitemap: ${SITEMAP_FILE}"

if [[ ! -f "$SITEMAP_FILE" ]]; then
  echo "Error: File '${SITEMAP_FILE}' not found." >&2
  exit 1
fi

# ── URL EXTRACTION ─────────────────────────────────────────────
# Pull every <loc>…</loc> value out of the sitemap and normalise it into
# urls.txt (one absolute URL per line, sorted, de-duplicated):
#   1. trim surrounding whitespace and unwrap <![CDATA[…]]>
#   2. decode the five predefined XML entities; &amp; goes last so that
#      "&amp;lt;" becomes "&lt;" and is not decoded a second time
#      (CDATA values are decoded too — harmless unless a URL literally
#      contains an entity string)
#   3. prefix root-relative paths with DOMAIN, drop one trailing slash
#   4. keep only http(s) URLs and drop static assets by extension
#      (case-insensitively, so ".PDF" is filtered like ".pdf")
# If <loc> is never found grep exits non-zero; that is fine, the empty
# urls.txt is reported just below.

# Escape the characters that are special in a sed replacement using "|"
# as delimiter, so DOMAIN is always inserted literally.
domain_sed=$(printf '%s' "$DOMAIN" | sed 's/[\\&|]/\\&/g')

grep -oP '(?<=<loc>).*?(?=</loc>)' "$SITEMAP_FILE" \
  | sed \
    -e 's/^[[:space:]]*//; s/[[:space:]]*$//' \
    -e 's/^<!\[CDATA\[//; s/]]>$//' \
    -e 's/^[[:space:]]*//; s/[[:space:]]*$//' \
    -e 's/&lt;/</g; s/&gt;/>/g; s/&quot;/"/g' \
    -e "s/&apos;/'/g" \
    -e 's/&amp;/\&/g' \
    -e "s|^/|${domain_sed}/|" \
    -e 's|/$||' \
  | grep -E '^https?://' \
  | grep -viE '\.(jpg|jpeg|png|gif|webp|svg|pdf|xml|txt|zip|gz|css|js|json)$' \
  | LC_ALL=C sort -u > urls.txt
# LC_ALL=C: byte-wise ordering, so de-duplication never merges URLs that
# merely collate as equal in the user's locale.
# ────────────────────────────────────────────────────────────────

if [[ ! -s urls.txt ]]; then
  # Diagnostics belong on stderr; name the file that was actually read.
  {
    echo "Still no valid URLs found."
    echo "Please check if ${SITEMAP_FILE} really contains <loc> tags."
    echo "First 15 lines of ${SITEMAP_FILE}:"
    head -n 15 "$SITEMAP_FILE"
  } >&2
  exit 1
fi

# ── PAGE DOWNLOAD ──────────────────────────────────────────────
# Fetch every URL from urls.txt, render the HTML to text with lynx and
# append it to OUTPUT_FILE under a "SOURCE: <url>" banner. Progress goes
# to stdout, problems to stderr. A failed page never stops the run; it is
# counted and reported in the final summary.

# Arithmetic expansion strips the padding some wc implementations add.
count=$(($(wc -l < urls.txt)))
echo "Found ${count} pages."
echo ""

# Start from an empty output file; stop early if it cannot be written.
if ! : > "$OUTPUT_FILE"; then
  echo "Error: cannot write to '${OUTPUT_FILE}'." >&2
  exit 1
fi

i=0
failed=0
while IFS= read -r url; do
  # Plain assignment instead of ((i++)): the latter returns status 1 when
  # i is 0, which would abort the script if errexit were ever enabled.
  i=$((i + 1))
  echo "[$i/${count}] ${url}"

  {
    printf '\n\n%s\n' "════════════════════════════════════════════════════════════════"
    printf "SOURCE: %s\n" "$url"
    printf '%s\n\n' "════════════════════════════════════════════════════════════════"

    # --fail: on HTTP >= 400 curl emits no body, so error pages are not
    #         written into the dump as if they were documentation.
    # -sS:    no progress meter, but real errors still reach stderr.
    # lynx renders very wide (no wrapping of its own); blank lines are
    # dropped, text is re-wrapped at 60 columns, trailing blanks trimmed.
    curl -sS --fail --max-time 20 --retry 2 --location \
      -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" \
      "$url" \
      | lynx -stdin -dump -nolist -nomargins -width=9999 2>/dev/null \
      | sed '/^[[:space:]]*$/d' \
      | fold -s -w 60 \
      | sed 's/[[:space:]]*$//'
    # Must be captured immediately: the next command overwrites PIPESTATUS.
    stage_rc=("${PIPESTATUS[@]}")

    printf '\n\n'
  } >> "$OUTPUT_FILE"

  if ((stage_rc[0] != 0 || stage_rc[1] != 0)); then
    failed=$((failed + 1))
    echo "  warning: fetch/convert failed (curl=${stage_rc[0]}, lynx=${stage_rc[1]})" >&2
  fi

  # Be polite between requests, but do not wait after the last one.
  if ((i < count)); then
    sleep "${REQUEST_DELAY}"
  fi

done < urls.txt

echo "Finished. Processed ${i} pages."
if ((failed > 0)); then
  echo "Warning: ${failed} of ${i} pages could not be fetched or converted." >&2
fi
echo "Output: ${OUTPUT_FILE}  ($(du -h "$OUTPUT_FILE" | cut -f1))"
DEV Community

store26

Top comments (0)