#!/usr/bin/env bash
# Sitemap to read; resolved relative to the current working directory.
SITEMAP_FILE='sitemap.xml'
# Site origin (scheme + host, no trailing slash). Used below both to
# absolutise root-relative <loc> values and to keep only same-site URLs.
DOMAIN='https://lightspeed-user-docs.lightspeedhq.com'
# Abort early when the sitemap is not a regular file: nothing below can work
# without it. The diagnostic goes to stderr so it never mixes with normal
# output, and it names the file that was actually looked up.
if [[ ! -f "$SITEMAP_FILE" ]]; then
  printf 'Error: %s not found\n' "$SITEMAP_FILE" >&2
  exit 1
fi
#######################################
# Print the page URLs listed in a sitemap, one per line, de-duplicated and
# sorted.
#
# Processing steps, in order:
#   1. pull every <loc>...</loc> element out of the file (several per line
#      are fine) and strip the tags and surrounding whitespace;
#   2. decode the five predefined XML entities (&amp; last, so that
#      "&amp;lt;" becomes "&lt;" and not "<");
#   3. turn root-relative paths ("/foo") into absolute URLs on the origin;
#   4. drop a single trailing slash so "/foo/" and "/foo" collapse;
#   5. keep only http(s) URLs whose host is exactly the origin's host;
#   6. drop empty lines and static assets (images, archives, css/js, ...).
#
# Arguments:
#   $1 - path to the sitemap XML file
#   $2 - site origin, e.g. https://docs.example.com (a trailing slash is
#        tolerated). It is spliced into a sed replacement, so it must not
#        contain '|' or '&'.
# Outputs:
#   the resulting URLs on stdout
#######################################
extract_sitemap_urls() {
  local sitemap=$1
  local origin=${2%/}
  local host=${origin#*://}
  # Escape the dots so that "a.b.com" cannot match "aXb.com".
  local host_re="${host//./\\.}"

  # Plain `grep -o` with [^<]* is used instead of PCRE look-arounds so the
  # script also runs where grep has no -P support.
  # The host pattern ends with ([/?#]|$): without that boundary a URL such
  # as https://<host>.evil.test/ would be accepted as same-site.
  grep -o '<loc>[^<]*</loc>' -- "$sitemap" \
    | sed \
        -e 's|^<loc>[[:space:]]*||' \
        -e 's|[[:space:]]*</loc>$||' \
        -e 's|&lt;|<|g' \
        -e 's|&gt;|>|g' \
        -e 's|&quot;|"|g' \
        -e "s|&apos;|'|g" \
        -e 's|&amp;|\&|g' \
        -e "s|^/|${origin}/|" \
        -e 's|/$||' \
    | grep -E "^https?://${host_re}([/?#]|\$)" \
    | grep -vE '(^$|\.(jpg|jpeg|png|gif|webp|svg|pdf|xml|txt|zip|gz|css|js|json|ico)$)' \
    | sort -u
}

extract_sitemap_urls "$SITEMAP_FILE" "$DOMAIN" > urls.txt
# Summarise the run: urls.txt is non-empty only when at least one URL
# survived the filters.
if [[ ! -s urls.txt ]]; then
  # Nothing extracted: show the head of the sitemap so its markup can be
  # inspected by eye.
  echo "Still 0 URLs. Showing first 20 lines of sitemap.xml:"
  head -n 20 "$SITEMAP_FILE"
else
  url_total=$(wc -l < urls.txt)
  echo "Success! Found ${url_total} URLs."
  echo "First 8 URLs:"
  head -n 8 urls.txt
fi
# For further actions, you may consider blocking this person and/or reporting abuse
# Top comments (0)