DEV Community

Query Filter
Query Filter

Posted on

store27

#!/usr/bin/env bash
#
# Extract page URLs from a local sitemap file into urls.txt.
#
# Configuration:
#   SITEMAP_FILE  sitemap to read, relative to the current directory
#   DOMAIN        site origin; prepended to root-relative entries and used
#                 to keep only URLs that belong to this site

# Catch typos in variable names. `-e` / `pipefail` are intentionally left off:
# grep exits non-zero when it matches nothing, and an empty result is reported
# explicitly at the end of the script instead of aborting the run.
set -u

readonly SITEMAP_FILE="sitemap.xml"
readonly DOMAIN="https://lightspeed-user-docs.lightspeedhq.com"

# Fail fast, on stderr, naming the file that was actually looked for.
if [[ ! -f "$SITEMAP_FILE" ]]; then
  printf 'Error: %s not found\n' "$SITEMAP_FILE" >&2
  exit 1
fi

#######################################
# Print the unique, normalized page URLs listed in a sitemap.
# Arguments:
#   $1 - path to the sitemap XML file
#   $2 - site origin, e.g. https://docs.example.com
# Outputs:
#   One URL per line on stdout, sorted and de-duplicated.
# Notes:
#   Line-based: a <loc> element must open and close on the same line.
#######################################
extract_urls() {
  local sitemap=$1
  local origin=$2
  local host_re

  # Backslash-escape regex metacharacters in the host so "." matches only a
  # literal dot when the host is spliced into the ERE below.
  host_re=$(printf '%s' "${origin#https://}" | sed 's/[][\.^$*+?(){}|]/\\&/g')

  # 1. pull out every <loc>...</loc> element (portable; no PCRE needed)
  # 2. strip the tags, trim whitespace, absolutize root-relative paths,
  #    drop a single trailing slash
  # 3. keep only URLs on this host; the host must be followed by the end of
  #    the string or one of / ? # : so that "host.evil.test" is rejected
  # 4. drop blank lines and static assets / nested sitemaps by extension
  grep -o '<loc>[^<]*</loc>' -- "$sitemap" \
    | sed -e 's|^<loc>||' -e 's|</loc>$||' \
          -e 's|^[[:space:]]*||' -e 's|[[:space:]]*$||' \
          -e 's|^/|'"${origin}"'/|' \
          -e 's|/$||' \
    | grep -E "^https?://${host_re}([/?#:]|\$)" \
    | grep -vE '(^$|\.(jpg|jpeg|png|gif|webp|svg|pdf|xml|txt|zip|gz|css|js|json|ico)$)' \
    | sort -u
}

extract_urls "$SITEMAP_FILE" "$DOMAIN" > urls.txt

# Summarize the run: when urls.txt is empty or missing, show the top of the
# sitemap so the input can be inspected; otherwise print the URL count and a
# short preview of the results.
if [[ ! -s urls.txt ]]; then
  printf '%s\n' "Still 0 URLs. Showing first 20 lines of sitemap.xml:"
  head -n 20 "$SITEMAP_FILE"
else
  url_total=$(wc -l < urls.txt)
  printf 'Success! Found %s URLs.\n' "$url_total"
  printf '%s\n' "First 8 URLs:"
  head -n 8 urls.txt
fi
Enter fullscreen mode Exit fullscreen mode

Top comments (0)