#!/bin/bash

# Print the usage help to stdout, including a table that maps the number of
# downloaded essays (n) to the token count of the resulting pg.txt file,
# then terminate the script.
# Arguments: none
# Outputs:   help text on stdout
# Exits:     always, with status 1
function usage {
    # Script name first, then the placeholder for its single argument.
    printf 'usage: %s <n>\n' "$0"
    # Quoted delimiter: the table is emitted literally, with no expansion.
    cat <<'EOF'
note: n is the number of essays to download
for specific n, the resulting pg.txt file will have the following number of tokens:
n | tokens
--- | ---
1 | 6230
2 | 23619
5 | 25859
10 | 36888
15 | 50188
20 | 59094
25 | 88764
30 | 103121
32 | 108338
35 | 113403
40 | 127699
45 | 135896
EOF
    exit 1
}
# Abort the script unless the given command can be resolved by the shell.
# Arguments: $1 - name of the command to look up
# Outputs:   an error message on stderr when the command cannot be found
# Exits:     with status 1 when the command cannot be found
function has_cmd {
    # Rely on the exit status of `command -v` instead of testing its output
    # with -x: for builtins and functions it prints a bare name, not a path.
    # Quoting "$1" keeps a name containing whitespace from being looked up
    # as several separate commands.
    if ! command -v "$1" > /dev/null 2>&1; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}
# Main script: download the first n essays listed in the RSS feed, convert
# each one to plain text wrapped at 80 columns, and build up pg.txt.
# For essay number NNN (zero-padded) it writes:
#   pg-NNN-one.txt  - the text of that essay only
#   pg-NNN-all.txt  - a snapshot of pg.txt after that essay was appended

# Check every external tool used below before doing any work.
for tool in curl html2text grep head tail sed fmt; do
    has_cmd "$tool"
done

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# n is handed to `head -n`, so it has to be a positive integer.
case "$n" in
    '' | *[!0-9]*) usage ;;
esac
if ! [ "$n" -gt 0 ] 2> /dev/null; then
    usage
fi

feed_url="http://www.aaronsw.com/2002/feeds/pgessays.rss"

# Download the feed on its own so that a failed transfer is detected
# instead of silently producing an empty url list.
if ! feed="$(curl --fail "$feed_url")"; then
    echo "error: failed to download $feed_url" >&2
    exit 1
fi

# Keep the lines that mention an html page, trim each one down to the bare
# url, and take the first n of them.
urls="$(printf '%s\n' "$feed" | grep html | sed -e "s/.*http/http/" -e "s/html.*/html/" | head -n "$n")"

if [ -z "$urls" ]; then
    echo "error: no essay urls found in $feed_url" >&2
    exit 1
fi

printf "urls:\n%s\n" "$urls"

if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
# The url list is read on fd 3 so that commands inside the loop keep the
# script's own stdin.
while IFS= read -r url <&3; do
    if [ -z "$url" ]; then
        continue
    fi

    echo "processing $url"

    cc=$(printf "%03d" "$c")
    one="pg-$cc-one.txt"

    # Truncate (>) rather than append: a file left over from an earlier run
    # would otherwise end up duplicated in pg.txt.
    # tail -n +4 drops the first three lines of the html2text output; the
    # sed expression strips leading whitespace from every line.
    curl --fail -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 > "$one"
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
        # Best effort: report the failed download and carry on.
        echo "warning: failed to download $url" >&2
    fi

    cat "$one" >> pg.txt

    cp -v pg.txt "pg-$cc-all.txt"
    c=$((c+1))

    # Pause between requests.
    sleep 1
done 3<<< "$urls"

echo "done. data in pg.txt"

exit 0