#!/bin/sh
# url.sh harvest URL links from web dirs 2021-10-23, LTu
# assume safe dir and file names and safe input, too
# really far from perfect parsing, mind the results
# try to cope with .htaccess txt html pre and normal cases
#
# pipeline:
#   1. list regular files, keep only those `file` reports as text
#      (filenames must not contain ':' or whitespace -- assumed safe above)
#   2. concatenate them all
#   3. .htaccess "Redirect ..." lines: keep only the target URL (last field),
#      pass every other line through unchanged
#   4. isolate every http(s) URL onto its own line (covers html, pre, plain;
#      \x27 is an apostrophe -- spelled that way to survive shell quoting)
#   5. keep only lines that start with a URL; sort -u dedupes across ALL
#      files (plain uniq would only drop adjacent duplicates)
find . -type f \
  | xargs file \
  | grep ' text' \
  | awk -F: '{print $1}' \
  | xargs cat \
  | perl -ane 'printf "%s\n", /^Redirect/ ? $F[-1] : $_' \
  | perl -pe 's{(https?://[^\s"\x27<>]+)}{\n$1\n}g' \
  | sed -n '/^http/p' \
  | sort -u
# if debug, with sed maybe '/^http/p; w skipped.txt'