/* 1 Nov 2002 : GRABSITE Ver 1.2 This will take an html document and find all and def_tofile='index.htm' then the contents of this url would be written to: destination_dir\sports\scoreboard\index.htm */ def_tofile='INDEX.HTM' /*If HTML Document mode is selected, then only links ending with these extensions are downloaded, examined, and written. Notes: * In all cases, if the content-type header is NOT text/html, the contents will NOT be examined. * If HTMLEXTS='', then this test is not performed * If NOT_HTMLEXTS='', then this test is not performed */ htmlexts='SHTML SHT HTM HTML HTML-SSI HTM-SSI' /* log file. If none desired, set=0. Otherwise, enter a filename. Note that old log files will be deleted/overwritten */ logfile='GRABSITE.LOG' /* maximum time allowed to get a resouce, in seconds */ max_time=40 /* nocgi=1 to skip CGI urls (that have a /CGI in their path */ nocgi=1 /* space delimited list of strings that signfies "this is a CGI, or other protocol, script". If left blank, /CGI is used. Notes: * if ANY of these strings occur anywhere in the url, the url will NOT be retriered * a case insensitive comparison is used */ nocgi_strings='/CGI' /* nosearch=1 to skip urls that end with ?xxx (where xxx is a string of any length)*/ nosearch=1 /*If HTML Document mode is selected, then links ending with these extensions are NOT downloaded. Notes: * If "retrieve all links" mode is specified, then not_htmlexts is ignored * If HTMLEXTS<>'', then this test is not performed * If "retrieve all links" mode is specified, then htmlexts is ignored * If HTMLEXTS='', then this test is not performed */ not_htmlexts='JPG GIF BMP ZIP GZ TIF TIFF MOV AU EXE COM WAV XBM PDF PS EPS ' /* overwrite=1 means "overwrite preexisting files. Otherwise, don't overwrite */ overwrite=1 /* optional request header(s) to send to servers Note: use '0d0a'x to seperate multiple request headers */ reqheaders='User-agent: GrabSite' /* if URL's path starts with remove_prefix, then trim the beginning of the path (remove everything up to the first /) For example, if remove_prefix='!RANGE and a link is /!RANGE:bytes=100-200/surplus/prices.lst then /surplus/prices.lst is used */ remove_prefix='!RANGE' /* If robot_check=1, then check for a /ROBOTS.TXT file. This contains instructions on what paths should not be visited by "web robots". */ robot_check=1 /* if URL's path starts with skip_prefix, then skip it This is only needed when the "retrieve" test is /. */ skip_prefix='!' /* Status reports: -2 for NO status output, -1 for minimal, 0 for average 1 for some, 2 for too much */ verbose=2 /********** END USER changeable parameters ***********/ parse arg afile destdir includer includer2 write_all cmdline=0 if afile<>'' then do afile=translate(afile,'/','\') cmdline=1 end /* do */ write_all0=write_all if afile='?' then do say "GrabSite -- GET a linked set of pages from the WWW" say say "Calling syntax: GrabSite URL DestDir Test1 Test2 Get_all " say " where:" say " URL = a fully qualified URL (the home page to start at)" say " DestDir = destination directory (on local disk) to write results to" say " Test1 = only parse documents in/under this prefix " say " Test2 = only retrieve documents in/under this prefix " say " Get_all = if 0, then do NOT get non-html documents " say " " say " *** Note: to avoid command line problems: use \ instead of / ***" say " " say "Example: " say " D:>grabsite http:\\fu.br.net\circ\index.htm d:\foob \circ\ \ 1 " say say "Or .. enter without arguments for user prompts" exit end /* do */ /* initialize some stuff */ baseurl='' rootdir='' include_depth=0 depth_also='A' includer=translate(translate(includer,'/','\')) includer2=translate(translate(includer2,'/','\')) remove_prefix=translate(remove_prefix) skip_prefix=translate(skip_prefix) htmlexts=translate(htmlexts) not_htmlexts=translate(not_htmlexts) ndeleted=0 nwritten=0 ; noconnects=0 ngets=0; n400s=0 nparsed=0 if nocgi_strings='' then nocgi_strings='/CGI' crlf='0d0a'x fileurls.0=0 flist.0=0 call loaddll /* load some dlls, set some parameters */ signal on error name iserr ; signal on syntax name iserr say say " "cy_ye"GrabSite -- GET a set of linked documents from a WWW site"normal say if logfile=0 | logfile=' ' then do logfile=0 end /* do */ else do aa=stream(logfile,'c','query exists') if aa<>'' then do foo=sysfiledelete(logfile) if verbose>0 then say "Old logfile deleted: "logfile call lineout logfile,'GrabSite log file. Created '||time('n')||' '||date('n') end /* do */ end /***** determine file/url to read, and other info */ say jump1: nop if afile='' then do afile=getstring("Home page to grab, or enter ? for a brief description.",'?',reverse' 1)'normal) if afile='?' then do call helpme1 afile='' signal jump1 end end afile=strip(afile) afileu=translate(strip(afile)) if abbrev(afileu,'FILE://') then do parse var afile . '://' afile afileu=translate(afile) afile_isurl=0 if stream(afile,'c','query exists')="" then do call printsay "No such file: "afile exit end /* do */ fgoo=printsay( " ... reading "||cutstrg(afile,50) "....", , " ... reading "||afile|| "....") stuff=charin(afile,1,chars(afile)) afile_isurl=0 if baseurl='' then baseurl=getstring("Default site (the dotted ip address)",defbaseurl,reverse' 1a)'normal) if rootdir='' then rootdir=getstring("Default 'root' directory ",defrootdir,reverse' 1b)'normal) end /* local file as base */ else do /* it's a url */ if abbrev(afileu,'HTTP://')<>1 then do afile='http://'afile afileu=translate(afile) end /* do */ afile_isurl=1 parse var afile . '://' bb1 '/' bb2 baseurl=bb1 ii=lastpos('/',bb2) if ii=0 then rootdir='/' else rootdir=left(bb2,ii) end /* url entry */ if pos('://',baseurl)=0 then baseurl='http://'||baseurl rootdir=strip(rootdir) if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/' baseurl=strip(strip(baseurl,'t','/')) /* destination directory */ atdestdir: nop if destdir='' then do destdir=getstring("Enter a destination directory ",directory(),reverse' 2)'normal) if destdir="?" then do call helpme1 destdir='' signal atdestdir end destdir=strip(destdir) if substr(destdir,2,1)<>':' & abbrev(destdir,'\')=0 then do dd=directory() if destdir='' then destdir=strip(dd,,'\') else destdir=strip(dd,,'\')||'\'||destdir end didit=sysmkdir2(destdir,1) if didit<>0 then do say "Could not access, or create, "destdir exit end end else do destdir=strip(strip(destdir),'t','\')'\' didit=sysmkdir2(destdir,1) if didit<>0 then do say "Could not access, or create, "destdir exit end /* do */ end destdir=strip(strip(destdir),'t','\')'\' /* get and set includers variables */ call get_includers /* Quick/skeleton mode */ getquick:nop if cmdline<>1 then do do until write_all<>'' al=getstring(' HTML documents only (Yes, No, or ? for help)','N',reverse' 5)'normal) al=strip(translate(al)) if al='?' then do al='' call help_writeall iterate end if abbrev(al,'N')=1 then write_all=1 else write_all=0 end if write_all=0 then call printsay "Ignoring non-html documents" else call printsay "Retrieving all links " say /* modify other parameters */ if write_all0='' then do if yesno(" Would you like to modify configuration parameters?")=1 then do call modify_config end /* do */ end end /************** Done with user input **********/ /******* copy file/url to destdir */ /* if local file, copy directly to destidr if url, then maybe copy relative to destdir */ if afile_isurl=0 then do /* local file -- jump start*/ ff=translate(afile,' ','\/') ff2=word(ff,words(ff)) ff2=destdir||ff2 say bold"Saving to "normal|| ff2 foo=translate(stream(ff2,'c','open write')) if foo<>'READY:' then do say "Could not open file for writing. Error was: " foo exit end /* do */ foo=charout(ff2,stuff,1) if foo<>0 then do say "Error. Problem writing file " exit end /* do */ foo=stream(ff2,'c','close') foo=urls_in(stuff,baseurl,rootdir,afile,1) if verbose>0 then do foo=printsay(" ... done parsing "||cutstrg(afile,50),, " ... done parsing "afile) call printsay ' ' call printsay " " cy_ye " # links in "normal||bold||afile"="normal||" "||fileurls.0 end nparsed=1 end else do /* a url */ iurls=1 taref=strip(translate(afile)) if length(taref)<30 then do uaref=taref end else do p2=right(taref,30) p1=c2x(stringcrc(taref)) uaref=p1||'_'||p2 end /* uaref=translate(strip(afile)) */ flist.uaref=1 flist.0=1 fileurls.iurls=afile fileurls.iurls.!ref=' (user specified)' fileurls.iurls.!id=aref fileurls.0=iurls end fileurls.1.!depth=0 /**** get a robot.txt file first? */ if robot_check=1 then do aurl=baseurl'/robots.txt' rlist=get_url(aurl) exclist=add_robot(rlist) if verbose>0 then do call printsay "Excluding: "exclist ; call printsay ' ' end exclist.0=0 if exclist<>'' then do do ii=1 to words(exclist) exclist.ii=translate(strip(word(exclist,ii))) end /* do */ exclist.0=words(exclist) end /* do */ end /* build exclist. */ /************ Get urls in first file/url */ call printsay ' ' if write_all=0 then do if length(afile)<40 then do call printsay ' Examining html links starting from:'||bold||afile||normal end else do call printsay ' Examining html links starting from...' call printsay ' :'||bold||afile||normal end end else do if length(afile)<40 then do call printsay ' Examining links starting from:'||bold||afile||normal end else do call printsay ' Examining links starting from...' call printsay ' :'||bold||afile||normal end end call printsay ' ' /********** now get the urls, parse, add to list.... */ mm=0 do forever mm=mm+1 if mm>fileurls.0 then leave goob=fileurls.mm f1f_orig=goob f1f=goob isdepth=fileurls.mm.!DEPTH /* the depth of the resource (That might be parsed */ if length(f1f)>40 then f1f='...'right(goob,36) oof='' if verbose>0 then oof=' -- '||filespec('n',fileurls.mm.!ref) parse var goob . '://' bb1 '/' asel aselorig=asel /* check asel for ../ constructions, and fix if found */ do forever if pos('/../',asel)=0 then leave /* no ../ to remove */ parse var asel p1 '/../' p2 ip0=lastpos('/',p1) if ip0=0 then do asel='..' leave /* i.e.; http://foo.bar.net/../ are disallowed */ end /* do */ asel=left(p1,ip0)||p2 end /* do */ if asel='..' then do /* .. signals an error */ if verbose>1 then foog=printsay( "Skipping #"bold||mm||normal||" (too many /../) "||f1f||oof,, "Skipping #"bold||mm||normal||" (too many /../) "||f1f_orig||' '||oof) iterate end if asel<>aselorig & verbose>1 then call printsay "... using " asel goob2=translate('HTTP://'||bb1||'/'||asel) baseurl=bb1 ii=lastpos('/',asel) if ii=0 then rootdir='/' else rootdir=left(asel,ii) if pos('://',baseurl)=0 then baseurl='http://'||baseurl rootdir=strip(rootdir) if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/' baseurl=strip(strip(baseurl,'t','/')) if robot_no(asel)=1 then do if verbose>1 then foog=printsay( "Skipping #"bold||mm||normal||" (robot exclusion) "||f1f||oof ,, "Skipping #"bold||mm||normal||" (robot exclusion) "||f1f_orig||oof) iterate end if includer2<>"" then do /* only GET if in/under this directory */ if abbrev(goob2,includer2)=0 then do if verbose>1 then foog=printsay("Skipping #"bold||mm||normal||" (not in dir) "||f1f||oof,, "Skipping #"bold||mm||normal||" (not in dir) "||f1f_orig||oof) iterate end end if nocgi=1 then do /* cgi? then skip */ ccok=1 do kk=1 to words(nocgi_strings) if ccok=0 then leave acc=strip(translate(word(nocgi_strings,kk))) gloop=pos(acc,translate('/'asel)) if gloop>0 then do ccok=0 end end if ccok=0 then do if verbose>1 then foog=printsay("Skipping #"bold||mm||normal||" (CGI) "||f1f||oof ,, "Skipping #"bold||mm||normal||" (CGI) "||f1f_orig||oof ) iterate end end if nosearch=1 then do /* skip "search string" calls (usually to scripts*/ if pos('?',asel)>0 then do if verbose>1 then goog=printsay("Skipping #"bold||mm||normal||" (contains ?) "||f1f||oof,, "Skipping #"bold||mm||normal||" (contains ?) "||f1f_orig||oof) iterate end end if skip_prefix<>'' then do /* ignore if starts with this? */ if abbrev(asel,skip_prefix)=1 then do if verbose>1 then goof=printsay("Skipping #"bold||mm||normal||" (contains "skip_Prefix") "||f1f||oof,, "Skipping #"bold||mm||normal||" (contains "skip_Prefix") "||f1f_orig||oof) iterate end end ara=lastpos('.',asel);anext='' if ara>0 then do /* check for html type of extentsion*/ anext=translate(strip(substr(asel,ara+1))) end if htmlexts<>'' & write_all<>1 then do /* only get possible htmls */ if pos(anext,htmlexts)=0 then iterate end if not_htmlexts<>"" & write_all<>1 then do /* don't get almost certainly NOT htmls */ if pos(anext,not_htmlexts)>0 then iterate end if verbose>-1 then foog=printsay("Checking "bold||mm||normal||" of "fileurls.0")"||f1f||oof,, "Checking "bold||mm||normal||" of "fileurls.0")"||f1f_orig||oof) /* get the url */ stuff=get_url(goob,,verbose,reqheaders) if stuff<>'' then do goof=printsay(" .... done GETting "||cutstrg(goob,50),, " .... done GETting "||goob) end else do if is_timeout=1 then do goof=printsay("Timeout on #"bold||mm||normal||f1f||oof,, "Timeout on #"bold||mm||normal||f1f_orig||oof) noconnects=noconnects+1 iterate end else do goof=printsay("Skipping #"bold||mm||normal||" (problem GETting) "||f1f||oof,, "Skipping #"bold||mm||normal||" (problem GETting) "||f1f_orig||oof) noconnects=noconnects+1 iterate end end ngets=ngets+1 call extracts /* extract body and head */ /* look for return code */ parse var response_line . icode . r1=left(response_code,1) if r1=4 | r1=5 | r1=1 then do /* error response */ n400s=n400s+1 iterate end /* get the content-type */ ss='!CONTENT-TYPE' if translate(headers.ss)<>'TEXT/HTML' then do /* not html -- don't parse */ if write_all=1 then call url_to_file goob2 /* but possibly save to disk */ iterate /* don't bother parsing this */ end /* does it satisfy the INCLUDER test? */ ii=use_me(goob2,isdepth) if ii=0 then do call url_to_file goob2 iterate /* don't bother parsing this */ end /* extract links, but first write it to disk */ call url_to_file goob2 if result=0 then iterate /* if here, extract urls and add to list */ eek=fileurls.0 if verbose>0 then foof=printsay(" .... parsing "||cutstrg(goob,50),, " .... parsing "||goob) if r1=3 then do /* redirect -- extract location header */ ss='!LOCATION' asd=strip(headers.ss) if asd<>'' then do stuff=stuff||' ' /* convert location header to link (a small hack */ end end foo=urls_in(stuff,baseurl,rootdir,goob,isdepth+1) if verbose>0 then foog=printsay(" ... done parsing "||cutstrg(goob,50),, " ... done parsing "||goob) nparsed=nparsed+1 if verbose>1 then do if eek0 then call printsay " new links to check: "bold||(fileurls.0-eek)||normal end end /* ******* Read a url */ /**** Status info */ call printsay ' ' call printsay ' ------- Status: ' call printsay "Total number of unique URLs: "fileurls.0 call printsay "Total number retrieval attempts: " ngets '(400s='n400s'. No Connect='noconnects')' call printsay "Total number of parsed pages: "nparsed call printsay "Total number of files written: " nwritten '(files deleted='ndeleted')' call printsay " " call printsay "Reminder: files are written to "bold||destdir||normal if logfile<>0 then do say ' ** The log file is: ' logfile call lineout logfile end exit /********************/ /* test whether to parse this. Uses INCLUDER, DEPTH_ALSO, and INCLUDE_DEPTH */ use_me:procedure expose includer depth_also include_Depth parse arg goob2,mydepth /* depth, if enabled, can be sufficient */ if include_depth>0 then do if mydepth<=include_depth then return 1 end /* depth not satisfied. Check INCLUDER (in/under) test? */ if depth_also=0 then return 0 if includer='' then return 0 if abbrev(goob2,includer)=0 then return 0 return 1 /********/ /* modify configuration parameters */ modify_config: params="def_tofile htmlexts logfile not_Htmlexts overwrite robot_check reqheaders " params=params||"reqheaders verbose nocgi nocgi_strings nosearch remove_prefix skip_prefix" params=translate(params) say do forever aa=getstring("Select a parameter to modify (?=list,??=current values, X=done)","?",reverse" -->"normal) if aa="?" then do say say " "reverse"Configuration Parameters: "normal say bold" DEF_TOFILE"normal"= default filename, used when a URL does not contain a filename" say bold" HTMLEXTS"normal"= HTML extensions (if quick mode selected, only files with these " say " extensions are retrieved)" say bold" LOGFILE"normal"= Name of logfile (results are recorded here)" say bold" MAX_TIME"normal"= maximum time allowed to get a resouce, in seconds " say bold" NOCGI"normal"= If 1, do NOT retrieve URLs containing a NOCGI_STRINGS" say bold" NOCGI_STRINGS"normal"= List of strings that signify CGI script, or other types" say " of URLs to skip" say bold" NOSEARCH"normal"= If 1, do NOT retrieve URLs that end with a ?xxxx " say bold"NOT_HTMLEXTS"normal"= non-HTML extensions (if quick mode selected, files with these " say " extensions are ignored)" say bold" OVERWRITE"normal"= If 1, then overwrite preexisting files " say bold"REMOVE_PREFIX"normal"= If the URL's path starts with this, then trim the " say " beginning of the path (remove everything up to the first /) " say bold" SKIP_PREFIX"normal"= If the URL's path starts with this, then skip it " say bold" VERBOSE"normal"= If 1, verbose mode " say iterate end /* do */ if aa="??" then do say say " "reverse"Current values of configuration Parameters: "normal say bold" DEF_TOFILE"normal"= "def_tofile say bold" HTMLEXTS"normal"= "htmlexts say bold" LOGFILE"normal"= "logfile say bold" MAX_TIME"normal"= "max_time say bold" NOCGI"normal"= "nocgi say bold" NOCGI_STRINGS"normal"= "nocgi_STRINGS say bold" NOSEARCH"normal"= "nosearch say bold"NOT_HTMLEXTS"normal"= "not_htmlexts say bold" OVERWRITE"normal"= "overwrite say bold"REMOVE_PREFIX"normal"= "remove_Prefix say bold" REQHEADERS"normal"= "reqheaders say bold" ROBOT_CHECK"normal"= "robot_check say bold" SKIP_PREFIX"normal"= "skip_prefix say bold" VERBOSE"normal"= "verbose say say "Note: you can permanently change these values by editing GRABSITE.CMD" say iterate end /* do */ aa=translate(strip(aa)) if aa='X' then leave if wordpos(aa,params)=0 then do say "No such parameter: " aa end /* do */ else do aaold=value(aa) bb=getstring("Enter new value for "aa,aaold,bold" --->"normal) foo=value(aa,bb) end end return 0 /********/ /* get and set includer and includers2 */ get_includers: include2: nop if includer2='' then do includer2=getstring(" Only GET (& save) urls that begin with ",'/',reverse' 3)'normal) end if includer2="?" then do call help_includer includer2='' signal include2 end includer2=translate(includer2) if includer2='' then includer2=baseurl||'/' else includer2=baseurl||'/'||strip(includer2,'l','/') include1: nop if includer='' then do includer=getstring(" Only GET & process urls in or under ",rootdir,reverse' 4a)'normal) end if includer="?" then do call help_includer includer='' signal include1 end includer=translate(includer) include1a: nop include_depth=getstring(' GET & process urls at this DEPTH (from base-URL) ',, include_depth,reverse' 4b)'normal) if include_depth="?" then do call help_includedepth include_depth=0 signal include1a end if datatype(include_depth)<>'NUM' then do say "The DEPTH parameter must be an integer " signal include1a end include1b: nop if include_Depth<>0 then do do forever depth_also=getstring('use in addition to (A), or instead (N), of the URLS in or under test',, depth_also,reverse' 4b.i)'normal) depth_also=translate(strip(depth_also)) if depth_also="?" then do call help_includedepth depth_also='A' iterate end if abbrev(depth_also,'N')=1 then depth_also=0 else depth_also=1 leave end end if includer='' then includer=baseurl||rootdir else includer=baseurl||'/'strip(includer,'l','/') say includer=translate(includer) includer2=translate(includer2) if abbrev(translate(includer),translate(includer2))=0 then do call printsay "Error: processed URLS are NOT a subset of GET urls " signal include2 end if length(includer2)<50 then do call printsay "Only GETting URLs in/under: "includer2 end else do call printsay "Only GETting URLs in/under ... " call printsay " : "includer2 end if include_depth>0 then do call printsay "Process URLS within "include_depth" steps of base-URL" end if depth_also=1 | include_depth=0 then do if length(includer)<50 then do if include_depth=0 then call printsay "Process URLs in/under: "includer else call printsay " also process URLs in/under: "includer end else do if include_depth=0 then do call printsay "Process URLs in/under... " call printsay " : "includer end else do call printsay " also process URLs in/under... " call printsay " : "includer end end end len_includer2=length(includer2) say return 0 /**************************************************/ /* copy a url to a file */ url_to_file: parse arg afil goob2=translate(afil) if includer2<>"" then do /*relative to includer2 directory */ tofile=substr(goob2,len_includer2) end else do parse var afil . '://' . '/' tofile end /* do */ if tofile='' then do call printsay " ERROR: could not write " afil call printsay " ("goob")" return 0 end /* do */ if tofile='' | right(tofile,1)='/' then tofile=tofile||def_tofile /* save to destidr */ tofile=translate(tofile,'\','/') tofile=strip(strip(tofile),'l','\') tofile2=destdir||tofile todir=filespec('d',tofile2)||filespec('p',tofile2) mkit=sysmkdir2(todir) yow=stream(tofile2,'c','query exists') if yow<>'' then do if overwrite=2 then do if verbose>-1 then call printsay " "||cy_ye||tofile2||normal " old version used." return 1 /* use old copy */ end if overwrite=1 then do if verbose>0 then call printsay " .... deleting "tofile2 foo=sysfiledelete(tofile2) ndeleted=ndeleted+1 end else do call printsay " > "tofile2 " exists; "bold"skipping "normal return 0 end end foo=stream(tofile2,'c','open write') wow=charout(tofile2,stuff,1) if wow<>0 then do call printsay " ERROR: could not write " tofile2 call printsay " ("goob")" return 0 end foo=stream(tofile2,'c','close') if foo="READY:" then do if verbose>-2 then call printsay " "||cy_ye||tofile2||normal||stream(tofile2,'c','query size')||" bytes written." end nwritten=nwritten+1 return 1 /* sets globals */ /********************/ /* search a file, find IMG SRC= and A HREF= urls. Add BASEURL if no / or http://.../ at beginning of URL */ urls_in:procedure expose fileurls. flist. remove_prefix bold normal logfile reverse cy_ye parse arg stuff, baseurl,rootdir,stuffname,isdepth /* remove comments */ body="" do forever /*no comments within comments are allowed */ if stuff="" then leave parse var stuff t1 '' stuff body=body||t1 end /* do */ stuff=body body='' if verbose=1 then call printsay "Parsing "||length(stuff)||' characters' /* find all IMG SRC= and A HREF=, FRAME= throw away internal links */ do until stuff="" parse var stuff . '<' anarg '>' stuff aref=afindsrc(anarg) if aref='' then iterate taref=translate(aref) if abbrev(taref,'MAILTO:')=1 then iterate /* only keep http */ if abbrev(taref,'FTP:')=1 then iterate if abbrev(taref,'GOPHER:')=1 then iterate /* fix up name to be fully qualified url */ select when abbrev(taref,'HTTP://')=1 then nop when abbrev(taref,'/')=1 then aref=baseurl||aref otherwise aref=baseurl||rootdir||aref end /* check for remove_prefix entries */ if remove_prefix<>'' then do parse var aref a1 '://' a2 '/' aaurl if abbrev(translate(aaurl),translate(remove_prefix))=1 then do parse var aaurl . '/' aaurl aref=a1'://'a2'/'aaurl if verbose=1 then call printsay " > " remove_prefix "removal yields: "aref end end /* record this entry only if not yet recorded -- else, just increment counter */ taref=strip(translate(aref)) if length(taref)<30 then do uaref=taref end else do p2=right(taref,30) p1=c2x(stringcrc(taref)) uaref=p1||'_'||p2 end if datatype(flist.uaref)<>'NUM' then flist.uaref=0 flist.uaref=1+flist.uaref flist.0=flist.0+1 if flist.uaref=1 then do iurls=fileurls.0+1 fileurls.iurls=aref fileurls.iurls.!ref=stuffname fileurls.iurls.!depth=isdepth fileurls.iurls.!id=uaref fileurls.0=iurls end end return iurls /*****************/ /* get a string from user */ getstring:procedure expose normal bold reverse logfile cy_ye parse arg prompt,def,prompt0 abold=bold if bold="BOLD" then abold='' anormal=normal if normal='NORMAL' then anormal='' l1=length(prompt) l2=length(def) if l1+l2>38 then do say prompt0' 'abold||prompt||anormal if l2>22 then do say ' (ENTER='abold||def||normal')' call charout, bold" ? "normal parse pull ans end /* do */ else do call charout,' (ENTER='abold||def||anormal')? ' parse pull ans end end else do call charout,prompt0' 'bold||prompt||normal' (ENTER='abold||def||anormal')? ' parse pull ans end if ans='' then ans=def return ans /* --- Load the function library, if necessary --- */ loaddll: if RxFuncQuery("SockLoadFuncs")=1 then do /* already there */ call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs" call SockLoadFuncs end foo=rxfuncquery('sysloadfuncs') if foo=1 then do call RxFuncAdd 'SysLoadFuncs', 'RexxUtil', 'SysLoadFuncs' call SysLoadFuncs end /**** foo=rxfuncquery('rexxlibregister') if foo=1 then do call rxfuncadd 'rexxlibregister','rexxlib', 'rexxlibregister' call rexxlibregister end foo=rxfuncquery('rexxlibregister') if foo=1 then do say " Could not find REXXLIB " exit end ***/ ansion=checkansi() if ansion=1 then do aesc='1B'x cy_ye=aesc||'[37;46;m' normal=aesc||'[0;m' bold=aesc||'[1;m' re_wh=aesc||'[31;47;m' reverse=aesc||'[7;m' end else do say " Warning: Could not detect ANSI.... output will look ugly ! " cy_ye="" ; normal="" ; bold="" ;re_wh="" ; reverse="" end /* Do */ return 1 /* -------------------- */ /* get a yes or no , return 1 if yes */ yesno:procedure expose normal reverse bold logfile cy_ye parse arg fooa , allopt,altans if altans<>" " & words(altans)>1 then do w1=strip(word(altans,1)) w2=strip(word(altans,2)) a1=left(w1,1) ; a2=left(w2,1) a1a=substr(w1,2) ; a2a=substr(w2,2) end else do a1='Y' ; a1a='es' a2='N' ; a2a='o' end /* Do */ ayn=' '||bold||a1||normal||a1a||'\'||bold||a2||normal||a2a if allopt=1 then ayn=ayn||'\'||bold||'A'||normal||'ll' do forever foo1=normal||reverse||fooa||normal||ayn call charout, foo1 normal ':' pull anans if abbrev(anans,a1)=1 then return 1 if abbrev(anans,a2)=1 then return 0 if allopt=1 & abbrev(anans,'A')=1 then return 2 end nocon: if rc=-7 then return 0 exit 0 /* ------------------------------------------------------------------ */ /* function: Check if ANSI is activated */ CheckAnsi: PROCEDURE thisRC = -1 trace off /* install a local error handler */ SIGNAL ON ERROR Name InitAnsiEnd "@ANSI 2>NUL | rxqueue 2>NUL" thisRC = 0 do while queued() <> 0 queueLine = lineIN( "QUEUE:" ) if pos( " on.", queueLine ) <> 0 | , /* USA */ pos( " (ON).", queueLine ) <> 0 then /* GER */ thisRC = 1 end /* do while queued() <> 0 */ InitAnsiEnd: signal off error RETURN thisRC /*************************/ /* return 1 if adir is an existing (possibly empty) directory , 0 if not */ dosisdir2:procedure parse arg adir adir=strip(adir) adir=strip(adir,'t','\') nowdir=directory() nowdrive=filespec('d',nowdir'\') nowpath=filespec('p',nowdir'\') adr=filespec('d',adir) if adr='' then do if abbrev(adir,'\')=0 then adir=nowdrive||nowpath||adir else adir=nowdrive||adir end /* do */ foo=sysfiletree(adir,goo,'D') if goo.0>0 then return 1 return 0 /*************************************/ /* parse GETten stuff to globals response_line = the response line response_code = the 200, 401, etc. code headers. = list of response headers stuff = the contents (the file) */ extracts: cr='0a'x parse var stuff response_line (cr) stuff parse var response_line . response_code . response_line=strip(response_line,,'0d'x) headers.0='' do forever parse var stuff ahead (cr) stuff ahead=strip(ahead,,'0d'x) if ahead='' then leave parse var ahead name ':' aval nn=translate('!'||name) headers.0=headers.0' 'nn headers.nn=aval end /* do */ /* remove html comments */ return 1 /* ------------- */ /* create a directory, arbitrarily deep. Returns 0 if succes, otherwise returns an error code adir: directory to create -- must be fully qualified. verbose: if 1, will write some status stuff to screen */ sysmkdir2:procedure parse arg adir,verbose adir=strip(adir,'t','\') if dosisdir2(adir)=1 then do /* already exists */ if verbose=1 then say " Using pre-existing directory: "adir return 0 end /* do */ ff=sysmkdir(adir) if ff=0 then return ff /* make the tree */ f2=adir'\' dd=filespec('d',f2) pp=filespec('p',f2) if pp='\' | pp='' then return -1 pp2=strip(translate(pp,' ','\')) do mm=1 to words(pp2) a1=subword(pp2,1,mm) a1=translate(a1,'\',' ') dd2=dd'\'a1 hoo=sysmkdir(dd2) if hoo=0 & verbose=1 then call printsay ' ... creating: 'dd2 end /* do */ return hoo /****************/ /* URL and DESTDIR help info */ helpme1: say say bold"GrabSite"normal" is designed to copy a WWW site to your local hard disk. " say say "It's easy to use: just specify a URL, and then specify a directory" say "on your hard drive to copy the web pages (and other files) retrieved" say "from this WWW site." say say "For example: suppose the 'home page' is" say " www.coolstuff.org/games/expert.htm" say "and the 'destination directory' is:" say " d:\localweb\game10 " say "Then..." say " a) GrabSite will GET (using socket calls) the /games/expert.htm HTML " say " document at www.coolstuff.org." say " b) A copy of /games/expert.htm will be written to d:\localweb\games10 " say " c) /games/expert.htm will be scanned for links " say " d) For each link found, repeat step a (changing names appropriately)" say say "Note: For hints on running from command line, run GrabSite with a ? argument." say" Example: D:>GrabSite ? " say call charout,reverse"Hit any key to continue "normal foo=sysgetkey('noecho') say return 1 /****************/ /* INCLUDER help info */ help_Includer: say say "You can, and should, limit the scope of "bold"GrabSite"normal"'s WWW downloads" say "(If you don't, you could end up downloading a significant chunk of the WWW!)" say say "There are two several used to limit scope: " say say " a) Limiting what URLS are "bold"downloaded"normal", but "bold"not"normal" examined." say " URLS that pass this test are downloaded (and saved to disk)." say " They are "bold"not"normal" parsed -- links they may contain are ignored." say say " b) Limiting what URLS are "bold"downloaded"normal" and "bold"examined"normal"." say ' URLS that pass test a), and that have a content-type of text/html, and ' say " that ALSO pass this test are retrieved, saved to disk, and parsed -- " say " '' then say "Only retrieve links ending with: "htmlexts else say "Retrieve links that do NOT end with: "not_htmlexts say say cy_ye" Note: Configuration hint:"normal say " You can modify this rule by changing the HTMLEXTS and NOT_HTMLEXTS parameters" say call charout,reverse"Hit any key to continue "normal foo=sysgetkey('noecho') say return 1 /***************/ /* cut length of string to nn characters, if necessary */ cutstrg:procedure parse arg astr,ilen if ilen='' then return astr if length(astr)0 then call lineout logfile,aval2 return 0 /***********************************/ /* search a file, find IMG SRC=, FRAME SRC=, and A HREF= urls. Add BASEURL if no / or http://.../ at beginning of URL Return results in hrefs. and imgs. */ afindsrc:procedure parse arg anarg parse var anarg htype stuff htype=translate(strip(htype)) anarg=translate(anarg,' ','0d0a0900'x) /* find all FRAME SRC=, IMG SRC= and A HREF=, throw away internal links */ chklist='BODY IMG A FRAME AREA EMBED LINK APPLET ' anctype=wordpos(htype,chklist) if anctype=0 then return '' /* not a url containing element */ /* depending on anctye, look for different things */ select when anctype=1 then do /* body background */ do forever if anarg='' then return '' /* nothing found */ parse var anarg a1 anarg ; a1=strip(a1) if abbrev(translate(a1),'BACKGROUND=')=0 then iterate parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"') return gotimg end /* do */ end /* i3>0 */ when anctype=2 then do /* img */ do forever if anarg='' then return '' parse var anarg a1 anarg ; a1=strip(a1) if abbrev(translate(a1),'SRC=')=0 then iterate parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"') return gotimg end /* do */ end when anctype=3 | anctype=5 | anctype=7 then do /* A AREA LINK */ do forever if anarg='' then leave parse var anarg a1 anarg ; a1=strip(a1) if abbrev(translate(a1),'HREF=')=0 then iterate parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"') parse var gothref gothref '#' . /* toss out internal jumps */ if gothref="" then return "" if abbrev(translate(gothref),'JAVASCRIPT:') then return "" /* don't do "javascript:" entries */ return gothref end /* do */ end when anctype=4 | anctype=6 then do /* FRAME EMBED */ do forever if anarg='' then leave parse var anarg a1 anarg ; a1=strip(a1) if abbrev(translate(a1),'SRC=')=0 then iterate parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"') parse var gothref gothref '#' . /* toss out internal jumps */ if gothref="" then return "" return gothref end /* do */ end when anctype=8 then do /* APPLET */ abase=''; aref='' do forever if anarg='' then leave parse var anarg a1 anarg ; a1=strip(a1) if abbrev(translate(a1),'CODE=') + , abbrev(translate(a1),'CODEBASE=')=0 then iterate if abbrev(translate(a1),'CODEBASE=')=1 then do parse var a1 '"' abase '"' . end /* do */ else do /* CODE */ parse var a1 '"' aref '"' end /* do */ if aref<>'' & abase<>'' then leave end if aref='' then return '' /* no CODE= found */ if abase<>'' then aref=strip(abase,'t','/')||'/'||strip(aref,'l','/') return aref end otherwise return '' end /* select */ return '' /***********/ /* remove substring */ removestrg:procedure parse arg aval,astr if pos(astr,aval)=0 then return aval aa='' do forever if aval='' then leave parse var aval a1 (astr) aval aa=aa||a1 end return aa /**************/ /******************************/ /* parse a robots.txt file, The algorithim: 1 ignore # lines (comments) 2a look for user-agent: grabsite lines 2b if none, look for user-agent:* lines 3 if 2a or 2b don't match, then no robot disallows exist 4 otherwise, from the look for disallow lines going starting from the user-agent line, until the first empty line (use 0a as line delimiter, and throw away the 0d) 5 add from each disallow: asel to exclusion_list --------------- # samples robots.txt -- will add cgi-* to exclusion_list user-agent: mozilla Disallow: /samples Disallow: /stuff/ #user-agent: checklink user-agent:gizmo disallow:fes/ user-agent:* disallow:cgi- --------------- */ add_robot:procedure expose verbose parse arg abody parse var abody . icode . if left(strip(icode),1)<>2 then return '' /* not 200 code, so no disallows */ cr='0a'x do forever /* get rid of response header */ if abody='' then return '' /* nothing in body */ parse var abody al1 (cr) abody al1=strip(al1,,'0d'x) if al1='' then leave /* found empty line*/ end nn=0 do forever if abody='' then leave parse var abody al1 (cr) abody al1=strip(al1,,'0d'x) if al1='#' then iterate parse var al1 al1a '#' . nn=nn+1 lins.nn=al1a end if nn=0 then return '' /* no entries, return */ lins.0=nn /* look for GRABSITE, or *, user-agent */ iat=0 do mm=1 to lins.0 al=strip(lins.mm) if abbrev(translate(al),'USER-AGENT')=0 then iterate parse var al . ':' dagent ; dagent=translate(strip(dagent)) if abbrev(dagent,'CHECKLINK')=1 then do iat=mm leave end if dagent='*' then do iat=mm end /* do */ end /* do */ exlist2='' if iat=0 then return ' ' /* no matching user-agent */ do mm=iat+1 to lins.0 al=translate(strip(lins.mm)) if al='' then leave /* blank line signals end of "record" */ if abbrev(al,'DISALLOW')<>1 then iterate parse var al . ':' dasel ; dasel=strip(dasel) exlist2=exlist2||' '||strip(dasel,'l','/') end /* do */ return exlist2 /*******************/ /* compare arg against "robot" exclist. -- return 1 if a match */ robot_No:procedure expose exclist. parse upper arg asel asel=strip(asel,'l','/') do mm=1 to exclist.0 tt=exclist.mm if abbrev(asel,tt)=1 then return 1 end /* do */ return 0 /* - ------------------------ */ /* front end to get_url_adv. Return '' if some kind of error; else return contents */ get_url:procedure expose max_time logfile bold normal reverse cy_ye is_timeout parse arg aurl,maxchar,verbose,headers is_timeout=0 maxsize=maxchar type='GET' if abbrev(translate(aurl),'HTTP://')=0 then do if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)" return '' end parse var aurl . '://' server '/' req mehost='' upwd='' mhdrs=headers qs='' trn_id='' sock_time='' stuff=get_url_adv(max_time,maxsize,type,server,req,mehost,upwd,mhdrs, , verbose,qs,trn_Id,sock_time) parse var stuff astat aip '0d0a'x stuff /* put timeout message here */ is_timeout=1 if strip(astat)<>0 then return '' return stuff /* ==================================== */ /* GET a URL, with timeouts. See bottom of file for syntax */ get_url_adv:procedure expose logfile bold normal reverse cy_ye /** ---- begin user configurable parameters ---- */ /* Default Number of seconds wait on a socket. This is NOT the total time for a complete response, It's the amount of time to wait for a response from a single socket call. It can be overridden by the sock_time argument */ sock_time_def=30 /** ---- end user configurable parameters ---- */ parse arg tottime,totsize,type,server,request,mehost,upwd,moreheaders, , verbose,qs_info,transaction,sock_time if tottime='VERSION' then return '1.11d' if sock_time='' then sock_time=sock_time_def parse var qs_info myqueue mysem . myqueue=strip(myqueue) ; mysem=strip(mysem) stuff=bget_url(type,server,request,upwd) if myqueue='' then return stuff /* called as proc */ a10=transaction||' '||stuff /* else, called as daemon */ foo=rxqueue('s',myqueue) queue a10 exit " " /* end thread */ /* jump here if socket is killed or other such error. */ if myqueue='' then return ' ' /* called as proc */ a10=transaction foo=rxqueue('s',myqueue) queue a10 iserr2: exit "" /************************/ /* this gets a url , and uses the "sockin" procedure to avoid hangs returns error_code,stuff error codes: 6 = other error 4 = sockgethostbyname error 1 = ioctl error 2 = connection error 3 = problem encountered in sockrecv 0 = no error */ bget_url:procedure expose verbose mehost sock_time totsize tottime mysem myqueue moreheaders , logfile bold normal reverse cy_ye parse arg type,server,request,upwd parse var server server ',' servern ',' proxy family ="AF_INET" crlf="0d0a"x ; maxchar=1000000000 if tottime='' | tottime=0 then tottime=30 /* 30 second default */ if datatype(tottime)<>'NUM' then tottime=30 /****** if no proxy.... */ if proxy='' then do serv.0addr=' ' if servern<>'' then do servern=strip(servern) if verify(servern,'1234567890.')=0 then serv.0addr=servern /* use supplied number */ end httpport=80 parse var server server ":" bport if bport<>"" then httpport=bport rc=1 if serv.0addr='' then do if verify(server,'1234567890.')>0 then do /* non-numeric address */ rc=sockgethostbyname(strip(server), "serv.0") if verbose>2 & rc<>0 then call printsay 'DNS lookup of 'server "==="serv.0addr end else do serv.0addr=strip(server) end end if rc=0 then do if verbose>1 then call printsay ' GET_URL_ADV sockin error in sockgethostbyname 'rc '('server return "4" end end /* or there is a proxy? */ if proxy<>'' then do serv.0addr=' ' httpport=80 parse var proxy proxy ":" pport if pport<>"" then httpport=pport rc=1 if verify(proxy,'1234567890.')>0 then do rc=sockgethostbyname(strip(proxy), "serv.0") if verbose>2 & rc<>0 then call printsay 'DNS lookup of 'server "=="serv.0addr end else do serv.0addr=strip(proxy) end if rc=0 then do if verbose>2 then call printsay ' GET_URL_ADV sockin error in sockgethostbyname 'rc '('proxy return "4" end /* and change request to include the original server */ request=strip(request,"l","/") request='http://'||server||'/'||request /* and possibly strip port off of server (server will be used in Host: header */ parse var server server ":" . end /* end of proxy */ /* dns request? */ if type="DNS" then do dd=serv.0addr if httpport<>80 then dd=dd||':'||httpport if verbose>2 then call printsay 'DNS lookup: 'server "=="serv.0addr adot=left(dotserver,20,' ') return '0 '||adot||'0d0a'x||dd end if mysem<>'' then do aa=eventsem_query(mysem) if aa<1 then do adot=left(dotserver,20,' ') if verbose>2 then call printsay "Semaphore cancel (1) = "aa return '0 '||adot||' ' end end /* this is the stuff we really need */ dotserver=serv.0addr gosaddr.0family=family gosaddr.0port =httpport gosaddr.0addr =dotserver gosock = SockSocket(family, "SOCK_STREAM", 0) select when type='HEADGET' then do type='GET' ; maxchar=999 ; end when type='DSCGET' then do type='GET' ; maxchar=1499 ; end otherwise do if totsize<>'' then do if datatype(totsize)<>'NUM' then totsize=100 if totsize<1 then totsize=100 maxchar=totsize*1000 end end end if proxy='' then do request=strip(request,"l","/") message=type" /"request" HTTP/1.0"crlf"HOST: "server||crlf end else do message=type" "request" HTTP/1.0"crlf"HOST: "server||crlf end message=message||"Referer: GrabSite@"||mehost||crlf if upwd<>"" then do parse var upwd username password . if password<>'' then do aa=sre_pack64_make(strip(username)':'strip(password)) message=message||'Authorization: Basic '||aa||crlf end end if moreheaders<>'' then do /* add more headers */ do until moreheaders='' parse var moreheaders hhdr '0d0a'x moreheaders hhdr=strip(translate(hhdr,' ','09001a'x)) if hhdr='' then iterate message=message||hhdr||crlf end end message=message||crlf got="" hh=Sockioctl(gosock,'FIONBIO',1) if hh= -1 then do call printsay 'crashed in ioctl 'errno return '1' end rc = SockConnect(gosock,"gosaddr.0") r.0=0;e.0=0;w.0=1;w.1=gosock rcx=sockselect("R.","W.","E.",sock_time) if rcx<=0 then do rc1=sockshutdown(gosock,2) rc = SockClose(gosock) if verbose>2 then call printsay " GET_URL_ADV error on connecting to " server "=" rcx return "2" end hh=Sockioctl(gosock,'FIONBIO',0) rc = SockSend(gosock, message) if rc<0 then do goo=socksock_errno() rc=sockshutdown(gosock,2) rc = SockClose(gosock) if verbose>1 then call printsay ' GET_URL_ADV sockin error ('gosock'): 'goo '(from:' server " "request return '2 '||goo end if mysem<>'' then do aa=eventsem_query(mysem) if aa<1 then do adot=left(dotserver,20,' ') if verbose>2 then call printsay "Semaphore cancel (2) = "aa return '0 '||adot||' ' end end YOW=bSOCKIN(GOSOCK,sock_time,maxchar,verbose,tottime,mysem) /* several vars are exposed */ parse var yow astat ',' got rc=sockshutdown(gosock,2) rc = SockClose(gosock) if astat=0 then do /* 1=success, 2=stop at maxlen */ if verbose>1 then call printsay ' GET_URL_ADV sockin error ('gosock'): 'got '(from:' server " "request return '3 '||got end if astat=7 then do /* 1=success, 2=stop at maxlen */ if verbose>1 then call printsay ' GET_URL_ADV sockin error ('gosock'): 'got '(from:' server " "request return '7 '||got end if verbose > 2 then call printsay " GET_URL_ADV: ("gosock") "type"; got" length(got) "bytes of response from:" server " "request adot=left(dotserver,20,' ') if astat=1 then return '0 '||adot||'0d0a'x||got else return '6 '||adot||'0d0a'x||left(got,min(maxchar,length(got))) /**************************/ /* bSOCKIN: a replacement for sockrecv. call as stuff=sockin(socket,timeout,maxlen,verbose,tottime) where: socket == a socket that's been established using sockconnect timeout == a timeout value in seconds maxlen == maximum length of message to recieve If not specified, then no maximum is imposed verbose == If 1, then report status messages. If not specified, then do NOT report status messages tottime == total amount of time allocated (across all sockrecv calls and stuff = the contents returned from the server (up to maxlen characters) or an error message Responses start with 1, Error messages start with 0,-1 or 2, or 7 Note: timeout refers to maximum seconds between "sockrecv" requests. It does NOT refer to total length of time required to recieve a message. Thus, a number of medium length delays (say, a few seconds required to complete each of several sockrecv requests) will NOT cause a timeout (in other words, the timeout counter is reset upon each successful completion of a 1000 byte sockrecv request). */ bSOCKIN:PROCEDURE expose logfile bold normal reverse cy_ye PARSE Arg socket,timeout,maxlen,verbose,tottime,mysem if maxlen=0 | maxlen='' then maxlen=100000000 if verbose>2 then call PRINTsay "Start read of socket " socket if Sockioctl(socket,'FIONBIO',1) = -1 then /* switch to nonblocking mode */ Return '0,'||'crashed in ioctl 'errno ict=0 ok=0 incoming='' asec1=time('s') timeout0=timeout /* reset timeout after each successsful socket call */ Do While TimeOut > 0 if mysem<>'' then do aa=eventsem_query(mysem) if aa<1 then do if verbose>2 then call printsay "Semaphore cancel (3) = "aa ok=-1 leave /* simulate timeout */ end end asec2=time('s') if (asec2tottime then return '7,total time too long in SOCKIN 'asec2 ','asec1','tottime res=Sockrecv(socket,'data',1000) if res=-1 then do /* error condition ? */ If errno <> 'EWOULDBLOCK' THEN do /* real crash ? */ Return '0,'||'crashed in sockrecv 'errno /* yes */ end /* not-fatal,no-data-available-condition: errno = EWOULDBLOCK & sockrecv returned -1 */ ict=ict+1 if verbose>2 then call PRINTsay 'SockIn: Waiting on 'socket' ('ict')...' Call SysSleep 1 TimeOut = TimeOut - 1; /* count down my timer */ Iterate; /* loop again */ End; /* if sockrecv = -1 error */ if res=0 then do ok=1 ; leave /* got end of message, so exit this do loop*/ end if res<0 then do /* non EWOULDBLOCK error */ return '0,'||" Error in sockrecv " rc end incoming=incoming||data if verbose>2 then call pRINTsay 'SockIn ('socket'): total data recieved: 'length(incoming) if length(incoming)>maxlen then do ok=2 leave end ict=0 /* reset display counter */ timeout=timeout0 /* reset timeout countdown */ End /* do while timeout > 0 */ /* here we are timed out, or got entire message, or violated maxlen */ if ok=1 then do /* got message */ If sockioctl(socket,'FIONBIO',0) = -1 then do /* switch to blocking mode */ if verbose>0 then call PRINTsay 'SockIn: ioctl error on switch to blocking mode: ' errno end return '1,'||incoming /* success! */ end if ok=2 then do /* maxlen violation */ if sockioctl(socket,'FIONBIO',0) = -1 then do /* switch to blocking mode */ if verbose>0 then call PRINTsay 'SockIn: ioctl error on switch to blocking mode: ' errno end Return '2,'||incoming end if ok=-1 then Return '0,'||' forced timeout on sockrecv' /* forced timed out via semaphore */ Return '0,'||' timeout on sockrecv' /* timed out (ok=0) */ err1: call printsay " error in get_url_adv at line "sigl ","rc return '5' /**************************************/ /* SIMPLE version, no longer used */ /* ---------------------------------------------*/ /* get a url from some site, return first maxchar characters (if maxchar missing, get 10million (the whole thing?) call as: stuff=get_url(aurl,maxchar,verbose,headers) where: aurl: the url to GET (required) the other 3 are optional: maxchar: max chars to get (default=10,000,000) verbose: verbose mode (default=OFF) headers: list of extra request headers, CRLF delimited */ /* ---------------------------------------------*/ old_get_url:procedure expose logfile bold normal reverse cy_ye parse arg aurl,maxchar,verbose,headers if maxchar="" then maxchar=10000000 got="" if abbrev(translate(aurl),'HTTP://')=0 then do if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)" return '' end parse var aurl . '://' server '/' request if VERBOSE>1 then do if length(server||request)<65 then do call printsay " GETting: " server ", " request end else do call printsay " GETting: " server " " call printsay " " request end /* do */ end /* do */ /* now get the url. This requires the RxSock.DLL be in your LIBPATH. */ /* Load RxSock */ if \RxFuncQuery("SockLoadFuncs") then nop else do call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs" call SockLoadFuncs end crlf ='0d0a'x /* constants */ family ='AF_INET' httpport=80 rc=sockgethostbyname(server, "serv.0") /* get dotaddress of server */ if rc=0 then do call printsay ' Unable to resolve "'server'"' return 0 end dotserver=serv.0addr /* .. */ gosaddr.0family=family /* set up address */ gosaddr.0port =httpport gosaddr.0addr =dotserver gosock = SockSocket(family, "SOCK_STREAM", "IPPROTO_TCP") /* Set up request */ message="GET /"request' HTTP/1.0 'crlf||'Host: 'server||crlf if length(headers)>2 then do if right(headers,2)=crlf then headers=left(headers,length(headers)-2) end if headers<>'' then message=message||headers||crlf message=message||crlf got='' rc = SockConnect(gosock,"gosaddr.0") if rc<0 then do call printsay ' Unable to connect to "'server'"' return 0 end rc = SockSend(gosock, message) /* Now wait for the response */ do r=1 by 1 rc = SockRecv(gosock, "response", 1000) got=got||response if rc<=0 then leave tmplen=length(got) if tmplen> maxchar then leave end r rc = SockClose(gosock) return got /**************************************/ /* Description of GET_URL_ADV: Syntax: stuff=get_url_adv(maxtime,maxsize,type,server,req,mehost,upwd,mhdrs, , verbose,qs,trn_Id,sock_time) where ... maxtime = Optional. maximum time (in seconds) to allocate for this request. If not specified, 30 seconds is used. If the request takes longer then maxtime to complete, stop and return an error message maxsize = Optional. max length (in bytes) to allocate for this request. If not specified, infinite (actually, 100M) bytes is used. After maxsize, close connection and return an error message and what you got (these approx. 100k bytes). Type = type of request. Typically, HEAD or GET. However, three special proprietary types can be specified: DNS = lookup ip address of server. Return the IP address, perhaps with :port appended HEADGET = Use a GET request, but only return the HEAD (or the first 999 characters, whichever is less) DSCGET = Use a GET request, but only return the first 1499 characters Server = the server to send the request to. This can be a dns name, or an ipaddress Alternatively, you can use the following more complete syntax: ip_name,ip_address,proxy_address where: ip_name : the ip name (or number) to send the request to. If a non-80 port is used, append :port_number (i.e.; foo.bar.net:8082) ip_address : optional, the ip number for this ip name. If not included, a DNS request is done (thus, use of ip_address is a time saver), Do NOT append the ip port (if needed, it should be appended to ip_name) proxy_address : Ip address of this proxy. If possible, give the numeric address. If a name is given, then a DNS request is done (thus, use of a numeric address is a time saver) If the proxy is using a port other then 80, append :port_number to the proxy_address (even if it's numeric). For example: foo.bar.net:8080 or 152.15.22.16:8080 Notes: * If you are NOT using a proxy server, leave proxy_address blank. For example: foo.bar.net,189.12.51.62 * If you specify a proxy, then IP_address is ignored * ip_name is required, even if it's the same as the ip_address (it is used to form a Host: header) req : the request string. I.e.; the /mydir/index.sht portion of http://foo.bar.net/mydir/index.sht upwd: Optional. Username password, If username password, seperate them with a space. For example: 'Joe Xegi3' If specified, a Basic Authentication request header will be added. Alternatively you could add an explicit header, in mhdrs, using: aa=sre_pack64_make(strip(username)':'strip(passwd)) mdirs='Authentication: Basic '||aa mehost: Optional. the callers ip address. This is used in a REFERER header, mhdrs: optional. A crlf delimited list of headers, where each line has the form Header_NAME: a string verbose : Verbosity of intermediate output. Intermediate output is written to the screen and log file. 0 = NO intermediate output (the default) 1,2,3 signal various levels (minimal, some, too much) qs: Not required if a procedure call. A two component argument, containing: the queue to send results back on, and + the semaphore to use for forcing timeouts. This is NOT required if this is "called as a procedure", though the semaphore component of the argument can be used (though it is less likely to be useful) The queue component is REQUIRED if this is "called as a daemon"; the semaphore component is not required, but if available it will be used (it makes much more sense to use the semaphore when calling as a deamon). The syntax of qs_info must be: Queue_name' 'semaphore_name For example (without the " quotes): "QCH_0193 /SEM32/SRE2002_32_02" The optional semaphore_name should be an open and posted 32 bit semaphore. This procedure will poll this semaphore before all socket calls, and if is NOT posted (or if it has been closed), then an immediate timeout is forced. If you do not include a semaphore_name, then you won't be able to force a timeout. Otherwise, everything else will work (the polling will NOT occur). trnid : Not required if a procedure call. a transaction id. Should be unique to this transaction -- it's used to identify the response when returned through a queue. Therefore, it is not needed when calling this as a procedure. sock_time : max time PER SOCKET REQUEST. MUST be less then tottime. If not specified, sock_time_def is used Returns: If no error: When called as a procedure: 0' 'ip_address||'0d0a'x||stuff When called as daemon, get_url_adv will push transaction||' 0 '||ip_address||'0d0a'x||stuff to queue_name stuff is response from the server If error: If maxlen, use 6 as error, but also send maxlen (more or less) bytes in STUFF Else Procedure call: return the status-code Daemon call: push and transaction||' '||status_code||' '||error_message Error codes: 0 = no error 1 = ioctl error 2 = connection error 3 = problem encountered in sockrecv 4 = sockgethostbyname error 5 = coding error in this procedure 6 = maxlen hit -- so only first maxlen characters will be returned Examples: Call as procedure: moo=get_url_adv (50,'GET','foo.bar.net,125.22.251.2','/abc/alpha.htm','mysite.net',, 'myname mypwd',,1,,,20) as procedure, using a proxy: oo=get_url_adv(50,'DNS','proxy.wow.com') parse var oo . aproxy '0d0a'x . oo=get_url_adv(50,'GET','foo.bar.net,,'||aproxy,'/abc/alpha.htm','mysite.net',, 'myname mypwd',,1,,,20) */ iserr: say "error at line "sigl ',' rc exit