diff options
Diffstat (limited to 'scripts/websearch')
-rwxr-xr-x | scripts/websearch | 77 |
1 files changed, 77 insertions, 0 deletions
diff --git a/scripts/websearch b/scripts/websearch new file mode 100755 index 0000000..60c3a33 --- /dev/null +++ b/scripts/websearch @@ -0,0 +1,77 @@ +#! /bin/sh +## Hit the major search engines. Hose the [large] output to a file! +## autoconverts multiple arguments into the right format for given servers -- +## usually worda+wordb, with certain lame exceptions like dejanews. +## Extracting and post-sorting the URLs is highly recommended... +## +## Altavista currently handled by a separate script; may merge at some point. +## +## _H* original 950824, updated 951218 and 960209 + +test "${1}" = "" && echo 'Needs argument[s] to search for!' && exit 1 +PLUSARG="`echo $* | sed 's/ /+/g'`" +PIPEARG="`echo ${PLUSARG} | sed 's/+/|/g'`" +IFILE=/tmp/.webq.$$ + +# Don't have "nc"? Get "netcat" from avian.org and add it to your toolkit. +doquery () { + echo GET "$1" | nc -v -i 1 -w 30 "$2" "$3" +} + +# changed since original: now supplying port numbers and separator lines... + +echo "=== Yahoo ===" +doquery "/bin/search?p=${PLUSARG}&n=300&w=w&s=a" search.yahoo.com 80 + +echo '' ; echo "=== Webcrawler ===" +doquery "/cgi-bin/WebQuery?searchText=${PLUSARG}&maxHits=300" webcrawler.com 80 + +# the infoseek lamers want "registration" before they do a real search, but... +echo '' ; echo "=== Infoseek ===" +echo " is broken." +# doquery "WW/IS/Titles?qt=${PLUSARG}" www2.infoseek.com 80 +# ... which doesn't work cuz their lame server wants the extra newlines, WITH +# CRLF pairs ferkrissake. Fuck 'em for now, they're hopelessly broken. If +# you want to play, the basic idea and query formats follow. +# echo "GET /WW/IS/Titles?qt=${PLUSARG}" > $IFILE +# echo "" >> $IFILE +# nc -v -w 30 guide-p.infoseek.com 80 < $IFILE + +# this is kinda flakey; might have to do twice?? +echo '' ; echo "=== Opentext ===" +doquery "/omw/simplesearch?SearchFor=${PLUSARG}&mode=phrase" \ + search.opentext.com 80 + +# looks like inktomi will only take hits=100, or defaults back to 30 +# we try to suppress all the stupid rating dots here, too +echo '' ; echo "=== Inktomi ===" +doquery "/query/?query=${PLUSARG}&hits=100" ink3.cs.berkeley.edu 1234 | \ + sed '/^<IMG ALT.*inktomi.*\.gif">$/d' + +#djnews lame shit limits hits to 120 and has nonstandard format +echo '' ; echo "=== Dejanews ===" +doquery "/cgi-bin/nph-dnquery?query=${PIPEARG}+maxhits=110+format=terse+defaultOp=AND" \ + smithers.dejanews.com 80 + +# OLD lycos: used to work until they fucking BROKE it... +# doquery "/cgi-bin/pursuit?query=${PLUSARG}&maxhits=300&terse=1" \ +# query5.lycos.cs.cmu.edu 80 +# NEW lycos: wants the User-agent field present in query or it returns nothing +# 960206: webmaster@lycos duly bitched at +# 960208: reply received; here's how we will now handle it: +echo \ +"GET /cgi-bin/pursuit?query=${PLUSARG}&maxhits=300&terse=terse&matchmode=and&minscore=.5 HTTP/1.x" \ + > $IFILE +echo "User-agent: *FUCK OFF*" >> $IFILE +echo "Why: go ask todd@pointcom.com (Todd Whitney)" >> $IFILE +echo '' >> $IFILE +echo '' ; echo "=== Lycos ===" +nc -v -i 1 -w 30 twelve.srv.lycos.com 80 < $IFILE + +rm -f $IFILE +exit 0 + +# CURRENTLY BROKEN [?] +# infoseek + +# some args need to be redone to ensure whatever "and" mode applies |