/* 1 Nov 2002 : GRABSITE Ver 1.2
This will take an html document and find all
and def_tofile='index.htm'
then the contents of this url would be written to:
destination_dir\sports\scoreboard\index.htm */
def_tofile='INDEX.HTM'
/*If HTML Document mode is selected, then
only links ending with these extensions are downloaded, examined, and written.
Notes:
* In all cases, if the content-type header is NOT text/html,
the contents will NOT be examined.
* If HTMLEXTS='', then this test is not performed
* If NOT_HTMLEXTS='', then this test is not performed */
htmlexts='SHTML SHT HTM HTML HTML-SSI HTM-SSI'
/* log file. If none desired, set=0. Otherwise, enter a
filename. Note that old log files will be deleted/overwritten */
logfile='GRABSITE.LOG'
/* maximum time allowed to get a resouce, in seconds */
max_time=40
/* nocgi=1 to skip CGI urls (that have a /CGI in their path */
nocgi=1
/* space delimited list of strings that signfies "this is a
CGI, or other protocol, script". If left blank, /CGI is used.
Notes:
* if ANY of these strings occur anywhere in the url, the url
will NOT be retriered
* a case insensitive comparison is used */
nocgi_strings='/CGI'
/* nosearch=1 to skip urls that end with ?xxx
(where xxx is a string of any length)*/
nosearch=1
/*If HTML Document mode is selected, then
links ending with these extensions are NOT downloaded.
Notes:
* If "retrieve all links" mode is specified, then
not_htmlexts is ignored
* If HTMLEXTS<>'', then this test is not performed
* If "retrieve all links" mode is specified, then
htmlexts is ignored
* If HTMLEXTS='', then this test is not performed */
not_htmlexts='JPG GIF BMP ZIP GZ TIF TIFF MOV AU EXE COM WAV XBM PDF PS EPS '
/* overwrite=1 means "overwrite preexisting files.
Otherwise, don't overwrite
*/
overwrite=1
/* optional request header(s) to send to servers
Note: use '0d0a'x to seperate multiple request headers */
reqheaders='User-agent: GrabSite'
/* if URL's path starts with remove_prefix, then trim
the beginning of the path (remove everything up to the first /)
For example,
if remove_prefix='!RANGE
and a link is /!RANGE:bytes=100-200/surplus/prices.lst
then /surplus/prices.lst is used */
remove_prefix='!RANGE'
/* If robot_check=1, then check for a /ROBOTS.TXT file. This contains
instructions on what paths should not be visited by "web robots".
*/
robot_check=1
/* if URL's path starts with skip_prefix, then skip it
This is only needed when the "retrieve" test is /. */
skip_prefix='!'
/* Status reports:
-2 for NO status output, -1 for minimal, 0 for average
1 for some, 2 for too much */
verbose=2
/********** END USER changeable parameters ***********/
parse arg afile destdir includer includer2 write_all
cmdline=0
if afile<>'' then do
afile=translate(afile,'/','\')
cmdline=1
end /* do */
write_all0=write_all
if afile='?' then do
say "GrabSite -- GET a linked set of pages from the WWW"
say
say "Calling syntax: GrabSite URL DestDir Test1 Test2 Get_all "
say " where:"
say " URL = a fully qualified URL (the home page to start at)"
say " DestDir = destination directory (on local disk) to write results to"
say " Test1 = only parse documents in/under this prefix "
say " Test2 = only retrieve documents in/under this prefix "
say " Get_all = if 0, then do NOT get non-html documents "
say " "
say " *** Note: to avoid command line problems: use \ instead of / ***"
say " "
say "Example: "
say " D:>grabsite http:\\fu.br.net\circ\index.htm d:\foob \circ\ \ 1 "
say
say "Or .. enter without arguments for user prompts"
exit
end /* do */
/* initialize some stuff */
baseurl=''
rootdir=''
include_depth=0
depth_also='A'
includer=translate(translate(includer,'/','\'))
includer2=translate(translate(includer2,'/','\'))
remove_prefix=translate(remove_prefix)
skip_prefix=translate(skip_prefix)
htmlexts=translate(htmlexts)
not_htmlexts=translate(not_htmlexts)
ndeleted=0
nwritten=0 ; noconnects=0
ngets=0; n400s=0
nparsed=0
if nocgi_strings='' then nocgi_strings='/CGI'
crlf='0d0a'x
fileurls.0=0
flist.0=0
call loaddll /* load some dlls, set some parameters */
signal on error name iserr ; signal on syntax name iserr
say
say " "cy_ye"GrabSite -- GET a set of linked documents from a WWW site"normal
say
if logfile=0 | logfile=' ' then do
logfile=0
end /* do */
else do
aa=stream(logfile,'c','query exists')
if aa<>'' then do
foo=sysfiledelete(logfile)
if verbose>0 then say "Old logfile deleted: "logfile
call lineout logfile,'GrabSite log file. Created '||time('n')||' '||date('n')
end /* do */
end
/***** determine file/url to read, and other info */
say
jump1: nop
if afile='' then do
afile=getstring("Home page to grab, or enter ? for a brief description.",'?',reverse' 1)'normal)
if afile='?' then do
call helpme1
afile=''
signal jump1
end
end
afile=strip(afile)
afileu=translate(strip(afile))
if abbrev(afileu,'FILE://') then do
parse var afile . '://' afile
afileu=translate(afile)
afile_isurl=0
if stream(afile,'c','query exists')="" then do
call printsay "No such file: "afile
exit
end /* do */
fgoo=printsay( " ... reading "||cutstrg(afile,50) "....", ,
" ... reading "||afile|| "....")
stuff=charin(afile,1,chars(afile))
afile_isurl=0
if baseurl='' then baseurl=getstring("Default site (the dotted ip address)",defbaseurl,reverse' 1a)'normal)
if rootdir='' then rootdir=getstring("Default 'root' directory ",defrootdir,reverse' 1b)'normal)
end /* local file as base */
else do /* it's a url */
if abbrev(afileu,'HTTP://')<>1 then do
afile='http://'afile
afileu=translate(afile)
end /* do */
afile_isurl=1
parse var afile . '://' bb1 '/' bb2
baseurl=bb1
ii=lastpos('/',bb2)
if ii=0 then
rootdir='/'
else
rootdir=left(bb2,ii)
end /* url entry */
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/'
baseurl=strip(strip(baseurl,'t','/'))
/* destination directory */
atdestdir: nop
if destdir='' then do
destdir=getstring("Enter a destination directory ",directory(),reverse' 2)'normal)
if destdir="?" then do
call helpme1
destdir=''
signal atdestdir
end
destdir=strip(destdir)
if substr(destdir,2,1)<>':' & abbrev(destdir,'\')=0 then do
dd=directory()
if destdir='' then
destdir=strip(dd,,'\')
else
destdir=strip(dd,,'\')||'\'||destdir
end
didit=sysmkdir2(destdir,1)
if didit<>0 then do
say "Could not access, or create, "destdir
exit
end
end
else do
destdir=strip(strip(destdir),'t','\')'\'
didit=sysmkdir2(destdir,1)
if didit<>0 then do
say "Could not access, or create, "destdir
exit
end /* do */
end
destdir=strip(strip(destdir),'t','\')'\'
/* get and set includers variables */
call get_includers
/* Quick/skeleton mode */
getquick:nop
if cmdline<>1 then do
do until write_all<>''
al=getstring(' HTML documents only (Yes, No, or ? for help)','N',reverse' 5)'normal)
al=strip(translate(al))
if al='?' then do
al=''
call help_writeall
iterate
end
if abbrev(al,'N')=1 then
write_all=1
else
write_all=0
end
if write_all=0 then
call printsay "Ignoring non-html documents"
else
call printsay "Retrieving all links "
say
/* modify other parameters */
if write_all0='' then do
if yesno(" Would you like to modify configuration parameters?")=1 then do
call modify_config
end /* do */
end
end
/************** Done with user input **********/
/******* copy file/url to destdir */
/* if local file, copy directly to destidr
if url, then maybe copy relative to destdir
*/
if afile_isurl=0 then do /* local file -- jump start*/
ff=translate(afile,' ','\/')
ff2=word(ff,words(ff))
ff2=destdir||ff2
say bold"Saving to "normal|| ff2
foo=translate(stream(ff2,'c','open write'))
if foo<>'READY:' then do
say "Could not open file for writing. Error was: " foo
exit
end /* do */
foo=charout(ff2,stuff,1)
if foo<>0 then do
say "Error. Problem writing file "
exit
end /* do */
foo=stream(ff2,'c','close')
foo=urls_in(stuff,baseurl,rootdir,afile,1)
if verbose>0 then do
foo=printsay(" ... done parsing "||cutstrg(afile,50),,
" ... done parsing "afile)
call printsay ' '
call printsay " " cy_ye " # links in "normal||bold||afile"="normal||" "||fileurls.0
end
nparsed=1
end
else do /* a url */
iurls=1
taref=strip(translate(afile))
if length(taref)<30 then do
uaref=taref
end
else do
p2=right(taref,30)
p1=c2x(stringcrc(taref))
uaref=p1||'_'||p2
end
/* uaref=translate(strip(afile)) */
flist.uaref=1
flist.0=1
fileurls.iurls=afile
fileurls.iurls.!ref=' (user specified)'
fileurls.iurls.!id=aref
fileurls.0=iurls
end
fileurls.1.!depth=0
/**** get a robot.txt file first? */
if robot_check=1 then do
aurl=baseurl'/robots.txt'
rlist=get_url(aurl)
exclist=add_robot(rlist)
if verbose>0 then do
call printsay "Excluding: "exclist ; call printsay ' '
end
exclist.0=0
if exclist<>'' then do
do ii=1 to words(exclist)
exclist.ii=translate(strip(word(exclist,ii)))
end /* do */
exclist.0=words(exclist)
end /* do */
end /* build exclist. */
/************ Get urls in first file/url */
call printsay ' '
if write_all=0 then do
if length(afile)<40 then do
call printsay ' Examining html links starting from:'||bold||afile||normal
end
else do
call printsay ' Examining html links starting from...'
call printsay ' :'||bold||afile||normal
end
end
else do
if length(afile)<40 then do
call printsay ' Examining links starting from:'||bold||afile||normal
end
else do
call printsay ' Examining links starting from...'
call printsay ' :'||bold||afile||normal
end
end
call printsay ' '
/********** now get the urls, parse, add to list.... */
mm=0
do forever
mm=mm+1
if mm>fileurls.0 then leave
goob=fileurls.mm
f1f_orig=goob
f1f=goob
isdepth=fileurls.mm.!DEPTH /* the depth of the resource (That might be parsed */
if length(f1f)>40 then f1f='...'right(goob,36)
oof=''
if verbose>0 then oof=' -- '||filespec('n',fileurls.mm.!ref)
parse var goob . '://' bb1 '/' asel
aselorig=asel
/* check asel for ../ constructions, and fix if found */
do forever
if pos('/../',asel)=0 then leave /* no ../ to remove */
parse var asel p1 '/../' p2
ip0=lastpos('/',p1)
if ip0=0 then do
asel='..'
leave /* i.e.; http://foo.bar.net/../ are disallowed */
end /* do */
asel=left(p1,ip0)||p2
end /* do */
if asel='..' then do /* .. signals an error */
if verbose>1 then
foog=printsay( "Skipping #"bold||mm||normal||" (too many /../) "||f1f||oof,,
"Skipping #"bold||mm||normal||" (too many /../) "||f1f_orig||' '||oof)
iterate
end
if asel<>aselorig & verbose>1 then call printsay "... using " asel
goob2=translate('HTTP://'||bb1||'/'||asel)
baseurl=bb1
ii=lastpos('/',asel)
if ii=0 then
rootdir='/'
else
rootdir=left(asel,ii)
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/'
baseurl=strip(strip(baseurl,'t','/'))
if robot_no(asel)=1 then do
if verbose>1 then
foog=printsay( "Skipping #"bold||mm||normal||" (robot exclusion) "||f1f||oof ,,
"Skipping #"bold||mm||normal||" (robot exclusion) "||f1f_orig||oof)
iterate
end
if includer2<>"" then do /* only GET if in/under this directory */
if abbrev(goob2,includer2)=0 then do
if verbose>1 then
foog=printsay("Skipping #"bold||mm||normal||" (not in dir) "||f1f||oof,,
"Skipping #"bold||mm||normal||" (not in dir) "||f1f_orig||oof)
iterate
end
end
if nocgi=1 then do /* cgi? then skip */
ccok=1
do kk=1 to words(nocgi_strings)
if ccok=0 then leave
acc=strip(translate(word(nocgi_strings,kk)))
gloop=pos(acc,translate('/'asel))
if gloop>0 then do
ccok=0
end
end
if ccok=0 then do
if verbose>1 then
foog=printsay("Skipping #"bold||mm||normal||" (CGI) "||f1f||oof ,,
"Skipping #"bold||mm||normal||" (CGI) "||f1f_orig||oof )
iterate
end
end
if nosearch=1 then do /* skip "search string" calls (usually to scripts*/
if pos('?',asel)>0 then do
if verbose>1 then
goog=printsay("Skipping #"bold||mm||normal||" (contains ?) "||f1f||oof,,
"Skipping #"bold||mm||normal||" (contains ?) "||f1f_orig||oof)
iterate
end
end
if skip_prefix<>'' then do /* ignore if starts with this? */
if abbrev(asel,skip_prefix)=1 then do
if verbose>1 then
goof=printsay("Skipping #"bold||mm||normal||" (contains "skip_Prefix") "||f1f||oof,,
"Skipping #"bold||mm||normal||" (contains "skip_Prefix") "||f1f_orig||oof)
iterate
end
end
ara=lastpos('.',asel);anext=''
if ara>0 then do /* check for html type of extentsion*/
anext=translate(strip(substr(asel,ara+1)))
end
if htmlexts<>'' & write_all<>1 then do /* only get possible htmls */
if pos(anext,htmlexts)=0 then iterate
end
if not_htmlexts<>"" & write_all<>1 then do /* don't get almost certainly NOT htmls */
if pos(anext,not_htmlexts)>0 then iterate
end
if verbose>-1 then
foog=printsay("Checking "bold||mm||normal||" of "fileurls.0")"||f1f||oof,,
"Checking "bold||mm||normal||" of "fileurls.0")"||f1f_orig||oof)
/* get the url */
stuff=get_url(goob,,verbose,reqheaders)
if stuff<>'' then do
goof=printsay(" .... done GETting "||cutstrg(goob,50),,
" .... done GETting "||goob)
end
else do
if is_timeout=1 then do
goof=printsay("Timeout on #"bold||mm||normal||f1f||oof,,
"Timeout on #"bold||mm||normal||f1f_orig||oof)
noconnects=noconnects+1
iterate
end
else do
goof=printsay("Skipping #"bold||mm||normal||" (problem GETting) "||f1f||oof,,
"Skipping #"bold||mm||normal||" (problem GETting) "||f1f_orig||oof)
noconnects=noconnects+1
iterate
end
end
ngets=ngets+1
call extracts /* extract body and head */
/* look for return code */
parse var response_line . icode .
r1=left(response_code,1)
if r1=4 | r1=5 | r1=1 then do /* error response */
n400s=n400s+1
iterate
end
/* get the content-type */
ss='!CONTENT-TYPE'
if translate(headers.ss)<>'TEXT/HTML' then do /* not html -- don't parse */
if write_all=1 then call url_to_file goob2 /* but possibly save to disk */
iterate /* don't bother parsing this */
end
/* does it satisfy the INCLUDER test? */
ii=use_me(goob2,isdepth)
if ii=0 then do
call url_to_file goob2
iterate /* don't bother parsing this */
end
/* extract links, but first write it to disk */
call url_to_file goob2
if result=0 then iterate
/* if here, extract urls and add to list */
eek=fileurls.0
if verbose>0 then
foof=printsay(" .... parsing "||cutstrg(goob,50),,
" .... parsing "||goob)
if r1=3 then do /* redirect -- extract location header */
ss='!LOCATION'
asd=strip(headers.ss)
if asd<>'' then do
stuff=stuff||' ' /* convert location header to link (a small hack */
end
end
foo=urls_in(stuff,baseurl,rootdir,goob,isdepth+1)
if verbose>0 then
foog=printsay(" ... done parsing "||cutstrg(goob,50),,
" ... done parsing "||goob)
nparsed=nparsed+1
if verbose>1 then do
if eek0 then call printsay " new links to check: "bold||(fileurls.0-eek)||normal
end
end /* ******* Read a url */
/**** Status info */
call printsay ' '
call printsay ' ------- Status: '
call printsay "Total number of unique URLs: "fileurls.0
call printsay "Total number retrieval attempts: " ngets '(400s='n400s'. No Connect='noconnects')'
call printsay "Total number of parsed pages: "nparsed
call printsay "Total number of files written: " nwritten '(files deleted='ndeleted')'
call printsay " "
call printsay "Reminder: files are written to "bold||destdir||normal
if logfile<>0 then do
say ' ** The log file is: ' logfile
call lineout logfile
end
exit
/********************/
/* test whether to parse this. Uses INCLUDER, DEPTH_ALSO, and INCLUDE_DEPTH */
use_me:procedure expose includer depth_also include_Depth
parse arg goob2,mydepth
/* depth, if enabled, can be sufficient */
if include_depth>0 then do
if mydepth<=include_depth then return 1
end
/* depth not satisfied. Check INCLUDER (in/under) test? */
if depth_also=0 then return 0
if includer='' then return 0
if abbrev(goob2,includer)=0 then return 0
return 1
/********/
/* modify configuration parameters */
modify_config:
params="def_tofile htmlexts logfile not_Htmlexts overwrite robot_check reqheaders "
params=params||"reqheaders verbose nocgi nocgi_strings nosearch remove_prefix skip_prefix"
params=translate(params)
say
do forever
aa=getstring("Select a parameter to modify (?=list,??=current values, X=done)","?",reverse" -->"normal)
if aa="?" then do
say
say " "reverse"Configuration Parameters: "normal
say bold" DEF_TOFILE"normal"= default filename, used when a URL does not contain a filename"
say bold" HTMLEXTS"normal"= HTML extensions (if quick mode selected, only files with these "
say " extensions are retrieved)"
say bold" LOGFILE"normal"= Name of logfile (results are recorded here)"
say bold" MAX_TIME"normal"= maximum time allowed to get a resouce, in seconds "
say bold" NOCGI"normal"= If 1, do NOT retrieve URLs containing a NOCGI_STRINGS"
say bold" NOCGI_STRINGS"normal"= List of strings that signify CGI script, or other types"
say " of URLs to skip"
say bold" NOSEARCH"normal"= If 1, do NOT retrieve URLs that end with a ?xxxx "
say bold"NOT_HTMLEXTS"normal"= non-HTML extensions (if quick mode selected, files with these "
say " extensions are ignored)"
say bold" OVERWRITE"normal"= If 1, then overwrite preexisting files "
say bold"REMOVE_PREFIX"normal"= If the URL's path starts with this, then trim the "
say " beginning of the path (remove everything up to the first /) "
say bold" SKIP_PREFIX"normal"= If the URL's path starts with this, then skip it "
say bold" VERBOSE"normal"= If 1, verbose mode "
say
iterate
end /* do */
if aa="??" then do
say
say " "reverse"Current values of configuration Parameters: "normal
say bold" DEF_TOFILE"normal"= "def_tofile
say bold" HTMLEXTS"normal"= "htmlexts
say bold" LOGFILE"normal"= "logfile
say bold" MAX_TIME"normal"= "max_time
say bold" NOCGI"normal"= "nocgi
say bold" NOCGI_STRINGS"normal"= "nocgi_STRINGS
say bold" NOSEARCH"normal"= "nosearch
say bold"NOT_HTMLEXTS"normal"= "not_htmlexts
say bold" OVERWRITE"normal"= "overwrite
say bold"REMOVE_PREFIX"normal"= "remove_Prefix
say bold" REQHEADERS"normal"= "reqheaders
say bold" ROBOT_CHECK"normal"= "robot_check
say bold" SKIP_PREFIX"normal"= "skip_prefix
say bold" VERBOSE"normal"= "verbose
say
say "Note: you can permanently change these values by editing GRABSITE.CMD"
say
iterate
end /* do */
aa=translate(strip(aa))
if aa='X' then leave
if wordpos(aa,params)=0 then do
say "No such parameter: " aa
end /* do */
else do
aaold=value(aa)
bb=getstring("Enter new value for "aa,aaold,bold" --->"normal)
foo=value(aa,bb)
end
end
return 0
/********/
/* get and set includer and includers2 */
get_includers:
include2: nop
if includer2='' then do
includer2=getstring(" Only GET (& save) urls that begin with ",'/',reverse' 3)'normal)
end
if includer2="?" then do
call help_includer
includer2=''
signal include2
end
includer2=translate(includer2)
if includer2='' then
includer2=baseurl||'/'
else
includer2=baseurl||'/'||strip(includer2,'l','/')
include1: nop
if includer='' then do
includer=getstring(" Only GET & process urls in or under ",rootdir,reverse' 4a)'normal)
end
if includer="?" then do
call help_includer
includer=''
signal include1
end
includer=translate(includer)
include1a: nop
include_depth=getstring(' GET & process urls at this DEPTH (from base-URL) ',,
include_depth,reverse' 4b)'normal)
if include_depth="?" then do
call help_includedepth
include_depth=0
signal include1a
end
if datatype(include_depth)<>'NUM' then do
say "The DEPTH parameter must be an integer "
signal include1a
end
include1b: nop
if include_Depth<>0 then do
do forever
depth_also=getstring('use in addition to (A), or instead (N), of the URLS in or under test',,
depth_also,reverse' 4b.i)'normal)
depth_also=translate(strip(depth_also))
if depth_also="?" then do
call help_includedepth
depth_also='A'
iterate
end
if abbrev(depth_also,'N')=1 then
depth_also=0
else
depth_also=1
leave
end
end
if includer='' then
includer=baseurl||rootdir
else
includer=baseurl||'/'strip(includer,'l','/')
say
includer=translate(includer)
includer2=translate(includer2)
if abbrev(translate(includer),translate(includer2))=0 then do
call printsay "Error: processed URLS are NOT a subset of GET urls "
signal include2
end
if length(includer2)<50 then do
call printsay "Only GETting URLs in/under: "includer2
end
else do
call printsay "Only GETting URLs in/under ... "
call printsay " : "includer2
end
if include_depth>0 then do
call printsay "Process URLS within "include_depth" steps of base-URL"
end
if depth_also=1 | include_depth=0 then do
if length(includer)<50 then do
if include_depth=0 then
call printsay "Process URLs in/under: "includer
else
call printsay " also process URLs in/under: "includer
end
else do
if include_depth=0 then do
call printsay "Process URLs in/under... "
call printsay " : "includer
end
else do
call printsay " also process URLs in/under... "
call printsay " : "includer
end
end
end
len_includer2=length(includer2)
say
return 0
/**************************************************/
/* copy a url to a file */
url_to_file:
parse arg afil
goob2=translate(afil)
if includer2<>"" then do /*relative to includer2 directory */
tofile=substr(goob2,len_includer2)
end
else do
parse var afil . '://' . '/' tofile
end /* do */
if tofile='' then do
call printsay " ERROR: could not write " afil
call printsay " ("goob")"
return 0
end /* do */
if tofile='' | right(tofile,1)='/' then tofile=tofile||def_tofile
/* save to destidr */
tofile=translate(tofile,'\','/')
tofile=strip(strip(tofile),'l','\')
tofile2=destdir||tofile
todir=filespec('d',tofile2)||filespec('p',tofile2)
mkit=sysmkdir2(todir)
yow=stream(tofile2,'c','query exists')
if yow<>'' then do
if overwrite=2 then do
if verbose>-1 then call printsay " "||cy_ye||tofile2||normal " old version used."
return 1 /* use old copy */
end
if overwrite=1 then do
if verbose>0 then call printsay " .... deleting "tofile2
foo=sysfiledelete(tofile2)
ndeleted=ndeleted+1
end
else do
call printsay " > "tofile2 " exists; "bold"skipping "normal
return 0
end
end
foo=stream(tofile2,'c','open write')
wow=charout(tofile2,stuff,1)
if wow<>0 then do
call printsay " ERROR: could not write " tofile2
call printsay " ("goob")"
return 0
end
foo=stream(tofile2,'c','close')
if foo="READY:" then do
if verbose>-2 then
call printsay " "||cy_ye||tofile2||normal||stream(tofile2,'c','query size')||" bytes written."
end
nwritten=nwritten+1
return 1 /* sets globals */
/********************/
/* search a file, find IMG SRC= and A HREF= urls. Add BASEURL if
no / or http://.../ at beginning of URL */
urls_in:procedure expose fileurls. flist. remove_prefix bold normal logfile reverse cy_ye
parse arg stuff, baseurl,rootdir,stuffname,isdepth
/* remove comments */
body=""
do forever /*no comments within comments are allowed */
if stuff="" then leave
parse var stuff t1 '' stuff
body=body||t1
end /* do */
stuff=body
body=''
if verbose=1 then call printsay "Parsing "||length(stuff)||' characters'
/* find all IMG SRC= and A HREF=, FRAME= throw away internal links */
do until stuff=""
parse var stuff . '<' anarg '>' stuff
aref=afindsrc(anarg)
if aref='' then iterate
taref=translate(aref)
if abbrev(taref,'MAILTO:')=1 then iterate /* only keep http */
if abbrev(taref,'FTP:')=1 then iterate
if abbrev(taref,'GOPHER:')=1 then iterate
/* fix up name to be fully qualified url */
select
when abbrev(taref,'HTTP://')=1 then nop
when abbrev(taref,'/')=1 then aref=baseurl||aref
otherwise aref=baseurl||rootdir||aref
end
/* check for remove_prefix entries */
if remove_prefix<>'' then do
parse var aref a1 '://' a2 '/' aaurl
if abbrev(translate(aaurl),translate(remove_prefix))=1 then do
parse var aaurl . '/' aaurl
aref=a1'://'a2'/'aaurl
if verbose=1 then call printsay " > " remove_prefix "removal yields: "aref
end
end
/* record this entry only if not yet recorded -- else, just increment counter */
taref=strip(translate(aref))
if length(taref)<30 then do
uaref=taref
end
else do
p2=right(taref,30)
p1=c2x(stringcrc(taref))
uaref=p1||'_'||p2
end
if datatype(flist.uaref)<>'NUM' then flist.uaref=0
flist.uaref=1+flist.uaref
flist.0=flist.0+1
if flist.uaref=1 then do
iurls=fileurls.0+1
fileurls.iurls=aref
fileurls.iurls.!ref=stuffname
fileurls.iurls.!depth=isdepth
fileurls.iurls.!id=uaref
fileurls.0=iurls
end
end
return iurls
/*****************/
/* get a string from user */
getstring:procedure expose normal bold reverse logfile cy_ye
parse arg prompt,def,prompt0
abold=bold
if bold="BOLD" then abold=''
anormal=normal
if normal='NORMAL' then anormal=''
l1=length(prompt)
l2=length(def)
if l1+l2>38 then do
say prompt0' 'abold||prompt||anormal
if l2>22 then do
say ' (ENTER='abold||def||normal')'
call charout, bold" ? "normal
parse pull ans
end /* do */
else do
call charout,' (ENTER='abold||def||anormal')? '
parse pull ans
end
end
else do
call charout,prompt0' 'bold||prompt||normal' (ENTER='abold||def||anormal')? '
parse pull ans
end
if ans='' then ans=def
return ans
/* --- Load the function library, if necessary --- */
loaddll:
if RxFuncQuery("SockLoadFuncs")=1 then do /* already there */
call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
call SockLoadFuncs
end
foo=rxfuncquery('sysloadfuncs')
if foo=1 then do
call RxFuncAdd 'SysLoadFuncs', 'RexxUtil', 'SysLoadFuncs'
call SysLoadFuncs
end
/****
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
call rxfuncadd 'rexxlibregister','rexxlib', 'rexxlibregister'
call rexxlibregister
end
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
say " Could not find REXXLIB "
exit
end
***/
ansion=checkansi()
if ansion=1 then do
aesc='1B'x
cy_ye=aesc||'[37;46;m'
normal=aesc||'[0;m'
bold=aesc||'[1;m'
re_wh=aesc||'[31;47;m'
reverse=aesc||'[7;m'
end
else do
say " Warning: Could not detect ANSI.... output will look ugly ! "
cy_ye="" ; normal="" ; bold="" ;re_wh="" ;
reverse=""
end /* Do */
return 1
/* -------------------- */
/* get a yes or no , return 1 if yes */
yesno:procedure expose normal reverse bold logfile cy_ye
parse arg fooa , allopt,altans
if altans<>" " & words(altans)>1 then do
w1=strip(word(altans,1))
w2=strip(word(altans,2))
a1=left(w1,1) ; a2=left(w2,1)
a1a=substr(w1,2) ; a2a=substr(w2,2)
end
else do
a1='Y' ; a1a='es'
a2='N' ; a2a='o'
end /* Do */
ayn=' '||bold||a1||normal||a1a||'\'||bold||a2||normal||a2a
if allopt=1 then ayn=ayn||'\'||bold||'A'||normal||'ll'
do forever
foo1=normal||reverse||fooa||normal||ayn
call charout, foo1 normal ':'
pull anans
if abbrev(anans,a1)=1 then return 1
if abbrev(anans,a2)=1 then return 0
if allopt=1 & abbrev(anans,'A')=1 then return 2
end
nocon:
if rc=-7 then return 0
exit 0
/* ------------------------------------------------------------------ */
/* function: Check if ANSI is activated */
CheckAnsi: PROCEDURE
thisRC = -1
trace off
/* install a local error handler */
SIGNAL ON ERROR Name InitAnsiEnd
"@ANSI 2>NUL | rxqueue 2>NUL"
thisRC = 0
do while queued() <> 0
queueLine = lineIN( "QUEUE:" )
if pos( " on.", queueLine ) <> 0 | , /* USA */
pos( " (ON).", queueLine ) <> 0 then /* GER */
thisRC = 1
end /* do while queued() <> 0 */
InitAnsiEnd:
signal off error
RETURN thisRC
/*************************/
/* return 1 if adir is an existing (possibly empty) directory , 0 if not */
dosisdir2:procedure
parse arg adir
adir=strip(adir)
adir=strip(adir,'t','\')
nowdir=directory()
nowdrive=filespec('d',nowdir'\')
nowpath=filespec('p',nowdir'\')
adr=filespec('d',adir)
if adr='' then do
if abbrev(adir,'\')=0 then
adir=nowdrive||nowpath||adir
else
adir=nowdrive||adir
end /* do */
foo=sysfiletree(adir,goo,'D')
if goo.0>0 then return 1
return 0
/*************************************/
/* parse GETten stuff to globals
response_line = the response line
response_code = the 200, 401, etc. code
headers. = list of response headers
stuff = the contents (the file)
*/
extracts:
cr='0a'x
parse var stuff response_line (cr) stuff
parse var response_line . response_code .
response_line=strip(response_line,,'0d'x)
headers.0=''
do forever
parse var stuff ahead (cr) stuff
ahead=strip(ahead,,'0d'x)
if ahead='' then leave
parse var ahead name ':' aval
nn=translate('!'||name)
headers.0=headers.0' 'nn
headers.nn=aval
end /* do */
/* remove html comments */
return 1
/* ------------- */
/* create a directory, arbitrarily deep.
Returns 0 if succes, otherwise returns an error code
adir: directory to create -- must be fully qualified.
verbose: if 1, will write some status stuff to screen
*/
sysmkdir2:procedure
parse arg adir,verbose
adir=strip(adir,'t','\')
if dosisdir2(adir)=1 then do /* already exists */
if verbose=1 then say " Using pre-existing directory: "adir
return 0
end /* do */
ff=sysmkdir(adir)
if ff=0 then return ff
/* make the tree */
f2=adir'\'
dd=filespec('d',f2)
pp=filespec('p',f2)
if pp='\' | pp='' then return -1
pp2=strip(translate(pp,' ','\'))
do mm=1 to words(pp2)
a1=subword(pp2,1,mm)
a1=translate(a1,'\',' ')
dd2=dd'\'a1
hoo=sysmkdir(dd2)
if hoo=0 & verbose=1 then call printsay ' ... creating: 'dd2
end /* do */
return hoo
/****************/
/* URL and DESTDIR help info */
helpme1:
say
say bold"GrabSite"normal" is designed to copy a WWW site to your local hard disk. "
say
say "It's easy to use: just specify a URL, and then specify a directory"
say "on your hard drive to copy the web pages (and other files) retrieved"
say "from this WWW site."
say
say "For example: suppose the 'home page' is"
say " www.coolstuff.org/games/expert.htm"
say "and the 'destination directory' is:"
say " d:\localweb\game10 "
say "Then..."
say " a) GrabSite will GET (using socket calls) the /games/expert.htm HTML "
say " document at www.coolstuff.org."
say " b) A copy of /games/expert.htm will be written to d:\localweb\games10 "
say " c) /games/expert.htm will be scanned for links "
say " d) For each link found, repeat step a (changing names appropriately)"
say
say "Note: For hints on running from command line, run GrabSite with a ? argument."
say" Example: D:>GrabSite ? "
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/****************/
/* INCLUDER help info */
help_Includer:
say
say "You can, and should, limit the scope of "bold"GrabSite"normal"'s WWW downloads"
say "(If you don't, you could end up downloading a significant chunk of the WWW!)"
say
say "There are two several used to limit scope: "
say
say " a) Limiting what URLS are "bold"downloaded"normal", but "bold"not"normal" examined."
say " URLS that pass this test are downloaded (and saved to disk)."
say " They are "bold"not"normal" parsed -- links they may contain are ignored."
say
say " b) Limiting what URLS are "bold"downloaded"normal" and "bold"examined"normal"."
say ' URLS that pass test a), and that have a content-type of text/html, and '
say " that ALSO pass this test are retrieved, saved to disk, and parsed -- "
say " '' then
say "Only retrieve links ending with: "htmlexts
else
say "Retrieve links that do NOT end with: "not_htmlexts
say
say cy_ye" Note: Configuration hint:"normal
say " You can modify this rule by changing the HTMLEXTS and NOT_HTMLEXTS parameters"
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/***************/
/* cut length of string to nn characters, if necessary */
cutstrg:procedure
parse arg astr,ilen
if ilen='' then return astr
if length(astr)0 then call lineout logfile,aval2
return 0
/***********************************/
/* search a file, find IMG SRC=, FRAME SRC=, and A HREF= urls. Add BASEURL if
no / or http://.../ at beginning of URL
Return results in hrefs. and imgs. */
afindsrc:procedure
parse arg anarg
parse var anarg htype stuff
htype=translate(strip(htype))
anarg=translate(anarg,' ','0d0a0900'x)
/* find all FRAME SRC=, IMG SRC= and A HREF=, throw away internal links */
chklist='BODY IMG A FRAME AREA EMBED LINK APPLET '
anctype=wordpos(htype,chklist)
if anctype=0 then return '' /* not a url containing element */
/* depending on anctye, look for different things */
select
when anctype=1 then do /* body background */
do forever
if anarg='' then return '' /* nothing found */
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'BACKGROUND=')=0 then iterate
parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"')
return gotimg
end /* do */
end /* i3>0 */
when anctype=2 then do /* img */
do forever
if anarg='' then return ''
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'SRC=')=0 then iterate
parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"')
return gotimg
end /* do */
end
when anctype=3 | anctype=5 | anctype=7 then do /* A AREA LINK */
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'HREF=')=0 then iterate
parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"')
parse var gothref gothref '#' . /* toss out internal jumps */
if gothref="" then return ""
if abbrev(translate(gothref),'JAVASCRIPT:') then return "" /* don't do "javascript:" entries */
return gothref
end /* do */
end
when anctype=4 | anctype=6 then do /* FRAME EMBED */
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'SRC=')=0 then iterate
parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"')
parse var gothref gothref '#' . /* toss out internal jumps */
if gothref="" then return ""
return gothref
end /* do */
end
when anctype=8 then do /* APPLET */
abase=''; aref=''
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'CODE=') + ,
abbrev(translate(a1),'CODEBASE=')=0 then iterate
if abbrev(translate(a1),'CODEBASE=')=1 then do
parse var a1 '"' abase '"' .
end /* do */
else do /* CODE */
parse var a1 '"' aref '"'
end /* do */
if aref<>'' & abase<>'' then leave
end
if aref='' then return '' /* no CODE= found */
if abase<>'' then aref=strip(abase,'t','/')||'/'||strip(aref,'l','/')
return aref
end
otherwise return ''
end /* select */
return ''
/***********/
/* remove substring */
removestrg:procedure
parse arg aval,astr
if pos(astr,aval)=0 then return aval
aa=''
do forever
if aval='' then leave
parse var aval a1 (astr) aval
aa=aa||a1
end
return aa
/**************/
/******************************/
/* parse a robots.txt file,
The algorithim:
1 ignore # lines (comments)
2a look for user-agent: grabsite lines
2b if none, look for user-agent:* lines
3 if 2a or 2b don't match, then no robot disallows exist
4 otherwise, from the look for disallow lines going starting from
the user-agent line, until the first empty line (use 0a as line delimiter,
and throw away the 0d)
5 add from each disallow: asel to exclusion_list
---------------
# samples robots.txt -- will add cgi-* to exclusion_list
user-agent: mozilla
Disallow: /samples
Disallow: /stuff/
#user-agent: checklink
user-agent:gizmo
disallow:fes/
user-agent:*
disallow:cgi-
---------------
*/
add_robot:procedure expose verbose
parse arg abody
parse var abody . icode .
if left(strip(icode),1)<>2 then return '' /* not 200 code, so no disallows */
cr='0a'x
do forever /* get rid of response header */
if abody='' then return '' /* nothing in body */
parse var abody al1 (cr) abody
al1=strip(al1,,'0d'x)
if al1='' then leave /* found empty line*/
end
nn=0
do forever
if abody='' then leave
parse var abody al1 (cr) abody
al1=strip(al1,,'0d'x)
if al1='#' then iterate
parse var al1 al1a '#' .
nn=nn+1
lins.nn=al1a
end
if nn=0 then return '' /* no entries, return */
lins.0=nn
/* look for GRABSITE, or *, user-agent */
iat=0
do mm=1 to lins.0
al=strip(lins.mm)
if abbrev(translate(al),'USER-AGENT')=0 then iterate
parse var al . ':' dagent ; dagent=translate(strip(dagent))
if abbrev(dagent,'CHECKLINK')=1 then do
iat=mm
leave
end
if dagent='*' then do
iat=mm
end /* do */
end /* do */
exlist2=''
if iat=0 then return ' ' /* no matching user-agent */
do mm=iat+1 to lins.0
al=translate(strip(lins.mm))
if al='' then leave /* blank line signals end of "record" */
if abbrev(al,'DISALLOW')<>1 then iterate
parse var al . ':' dasel ; dasel=strip(dasel)
exlist2=exlist2||' '||strip(dasel,'l','/')
end /* do */
return exlist2
/*******************/
/* compare arg against "robot" exclist. -- return 1 if a match */
robot_No:procedure expose exclist.
parse upper arg asel
asel=strip(asel,'l','/')
do mm=1 to exclist.0
tt=exclist.mm
if abbrev(asel,tt)=1 then return 1
end /* do */
return 0
/* - ------------------------ */
/* front end to get_url_adv. Return '' if some kind of error; else return contents */
get_url:procedure expose max_time logfile bold normal reverse cy_ye is_timeout
parse arg aurl,maxchar,verbose,headers
is_timeout=0
maxsize=maxchar
type='GET'
if abbrev(translate(aurl),'HTTP://')=0 then do
if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)"
return ''
end
parse var aurl . '://' server '/' req
mehost=''
upwd=''
mhdrs=headers
qs=''
trn_id=''
sock_time=''
stuff=get_url_adv(max_time,maxsize,type,server,req,mehost,upwd,mhdrs, ,
verbose,qs,trn_Id,sock_time)
parse var stuff astat aip '0d0a'x stuff
/* put timeout message here */
is_timeout=1
if strip(astat)<>0 then return ''
return stuff
/* ==================================== */
/* GET a URL, with timeouts.
See bottom of file for syntax */
get_url_adv:procedure expose logfile bold normal reverse cy_ye
/** ---- begin user configurable parameters ---- */
/* Default Number of seconds wait on a socket.
This is NOT the total time for a complete response,
It's the amount of time to wait for a response from
a single socket call.
It can be overridden by the sock_time argument
*/
sock_time_def=30
/** ---- end user configurable parameters ---- */
parse arg tottime,totsize,type,server,request,mehost,upwd,moreheaders, ,
verbose,qs_info,transaction,sock_time
if tottime='VERSION' then return '1.11d'
if sock_time='' then sock_time=sock_time_def
parse var qs_info myqueue mysem .
myqueue=strip(myqueue) ; mysem=strip(mysem)
stuff=bget_url(type,server,request,upwd)
if myqueue='' then return stuff /* called as proc */
a10=transaction||' '||stuff /* else, called as daemon */
foo=rxqueue('s',myqueue)
queue a10
exit " " /* end thread */
/* jump here if socket is killed or other such error. */
if myqueue='' then return ' ' /* called as proc */
a10=transaction
foo=rxqueue('s',myqueue)
queue a10
iserr2:
exit ""
/************************/
/* this gets a url , and uses the "sockin" procedure to avoid hangs
returns error_code,stuff
error codes:
6 = other error
4 = sockgethostbyname error
1 = ioctl error
2 = connection error
3 = problem encountered in sockrecv
0 = no error
*/
bget_url:procedure expose verbose mehost sock_time totsize tottime mysem myqueue moreheaders ,
logfile bold normal reverse cy_ye
parse arg type,server,request,upwd
parse var server server ',' servern ',' proxy
family ="AF_INET"
crlf="0d0a"x ; maxchar=1000000000
if tottime='' | tottime=0 then tottime=30 /* 30 second default */
if datatype(tottime)<>'NUM' then tottime=30
/****** if no proxy.... */
if proxy='' then do
serv.0addr=' '
if servern<>'' then do
servern=strip(servern)
if verify(servern,'1234567890.')=0 then serv.0addr=servern /* use supplied number */
end
httpport=80
parse var server server ":" bport
if bport<>"" then httpport=bport
rc=1
if serv.0addr='' then do
if verify(server,'1234567890.')>0 then do /* non-numeric address */
rc=sockgethostbyname(strip(server), "serv.0")
if verbose>2 & rc<>0 then call printsay 'DNS lookup of 'server "==="serv.0addr
end
else do
serv.0addr=strip(server)
end
end
if rc=0 then do
if verbose>1 then
call printsay ' GET_URL_ADV sockin error in sockgethostbyname 'rc '('server
return "4"
end
end
/* or there is a proxy? */
if proxy<>'' then do
serv.0addr=' '
httpport=80
parse var proxy proxy ":" pport
if pport<>"" then httpport=pport
rc=1
if verify(proxy,'1234567890.')>0 then do
rc=sockgethostbyname(strip(proxy), "serv.0")
if verbose>2 & rc<>0 then call printsay 'DNS lookup of 'server "=="serv.0addr
end
else do
serv.0addr=strip(proxy)
end
if rc=0 then do
if verbose>2 then call printsay ' GET_URL_ADV sockin error in sockgethostbyname 'rc '('proxy
return "4"
end
/* and change request to include the original server */
request=strip(request,"l","/")
request='http://'||server||'/'||request
/* and possibly strip port off of server (server will be used in Host: header */
parse var server server ":" .
end /* end of proxy */
/* dns request? */
if type="DNS" then do
dd=serv.0addr
if httpport<>80 then dd=dd||':'||httpport
if verbose>2 then call printsay 'DNS lookup: 'server "=="serv.0addr
adot=left(dotserver,20,' ')
return '0 '||adot||'0d0a'x||dd
end
if mysem<>'' then do
aa=eventsem_query(mysem)
if aa<1 then do
adot=left(dotserver,20,' ')
if verbose>2 then call printsay "Semaphore cancel (1) = "aa
return '0 '||adot||' '
end
end
/* this is the stuff we really need */
dotserver=serv.0addr
gosaddr.0family=family
gosaddr.0port =httpport
gosaddr.0addr =dotserver
gosock = SockSocket(family, "SOCK_STREAM", 0)
select
when type='HEADGET' then do
type='GET' ; maxchar=999 ;
end
when type='DSCGET' then do
type='GET' ; maxchar=1499 ;
end
otherwise do
if totsize<>'' then do
if datatype(totsize)<>'NUM' then totsize=100
if totsize<1 then totsize=100
maxchar=totsize*1000
end
end
end
if proxy='' then do
request=strip(request,"l","/")
message=type" /"request" HTTP/1.0"crlf"HOST: "server||crlf
end
else do
message=type" "request" HTTP/1.0"crlf"HOST: "server||crlf
end
message=message||"Referer: GrabSite@"||mehost||crlf
if upwd<>"" then do
parse var upwd username password .
if password<>'' then do
aa=sre_pack64_make(strip(username)':'strip(password))
message=message||'Authorization: Basic '||aa||crlf
end
end
if moreheaders<>'' then do /* add more headers */
do until moreheaders=''
parse var moreheaders hhdr '0d0a'x moreheaders
hhdr=strip(translate(hhdr,' ','09001a'x))
if hhdr='' then iterate
message=message||hhdr||crlf
end
end
message=message||crlf
got=""
hh=Sockioctl(gosock,'FIONBIO',1)
if hh= -1 then do
call printsay 'crashed in ioctl 'errno
return '1'
end
rc = SockConnect(gosock,"gosaddr.0")
r.0=0;e.0=0;w.0=1;w.1=gosock
rcx=sockselect("R.","W.","E.",sock_time)
if rcx<=0 then do
rc1=sockshutdown(gosock,2)
rc = SockClose(gosock)
if verbose>2 then call printsay " GET_URL_ADV error on connecting to " server "=" rcx
return "2"
end
hh=Sockioctl(gosock,'FIONBIO',0)
rc = SockSend(gosock, message)
if rc<0 then do
goo=socksock_errno()
rc=sockshutdown(gosock,2)
rc = SockClose(gosock)
if verbose>1 then call printsay ' GET_URL_ADV sockin error ('gosock'): 'goo '(from:' server " "request
return '2 '||goo
end
if mysem<>'' then do
aa=eventsem_query(mysem)
if aa<1 then do
adot=left(dotserver,20,' ')
if verbose>2 then call printsay "Semaphore cancel (2) = "aa
return '0 '||adot||' '
end
end
YOW=bSOCKIN(GOSOCK,sock_time,maxchar,verbose,tottime,mysem) /* several vars are exposed */
parse var yow astat ',' got
rc=sockshutdown(gosock,2)
rc = SockClose(gosock)
if astat=0 then do /* 1=success, 2=stop at maxlen */
if verbose>1 then call printsay ' GET_URL_ADV sockin error ('gosock'): 'got '(from:' server " "request
return '3 '||got
end
if astat=7 then do /* 1=success, 2=stop at maxlen */
if verbose>1 then call printsay ' GET_URL_ADV sockin error ('gosock'): 'got '(from:' server " "request
return '7 '||got
end
if verbose > 2 then
call printsay " GET_URL_ADV: ("gosock") "type"; got" length(got) "bytes of response from:" server " "request
adot=left(dotserver,20,' ')
if astat=1 then
return '0 '||adot||'0d0a'x||got
else
return '6 '||adot||'0d0a'x||left(got,min(maxchar,length(got)))
/**************************/
/* bSOCKIN: a replacement for sockrecv.
call as
stuff=sockin(socket,timeout,maxlen,verbose,tottime)
where:
socket == a socket that's been established using sockconnect
timeout == a timeout value in seconds
maxlen == maximum length of message to recieve
If not specified, then no maximum is imposed
verbose == If 1, then report status messages.
If not specified, then do NOT report status messages
tottime == total amount of time allocated (across all sockrecv calls
and
stuff = the contents returned from the server (up to maxlen characters)
or an error message
Responses start with 1,
Error messages start with 0,-1 or 2, or 7
Note: timeout refers to maximum seconds between "sockrecv" requests.
It does NOT refer to total length of time required to recieve a message.
Thus, a number of medium length delays (say, a few seconds required
to complete each of several sockrecv requests) will NOT cause a timeout
(in other words, the timeout counter is reset upon each successful
completion of a 1000 byte sockrecv request).
*/
bSOCKIN:PROCEDURE expose logfile bold normal reverse cy_ye
PARSE Arg socket,timeout,maxlen,verbose,tottime,mysem
if maxlen=0 | maxlen='' then maxlen=100000000
if verbose>2 then call PRINTsay "Start read of socket " socket
if Sockioctl(socket,'FIONBIO',1) = -1 then /* switch to nonblocking mode */
Return '0,'||'crashed in ioctl 'errno
ict=0
ok=0
incoming=''
asec1=time('s')
timeout0=timeout /* reset timeout after each successsful socket call */
Do While TimeOut > 0
if mysem<>'' then do
aa=eventsem_query(mysem)
if aa<1 then do
if verbose>2 then call printsay "Semaphore cancel (3) = "aa
ok=-1
leave /* simulate timeout */
end
end
asec2=time('s')
if (asec2tottime then return '7,total time too long in SOCKIN 'asec2 ','asec1','tottime
res=Sockrecv(socket,'data',1000)
if res=-1 then do /* error condition ? */
If errno <> 'EWOULDBLOCK' THEN do /* real crash ? */
Return '0,'||'crashed in sockrecv 'errno /* yes */
end
/* not-fatal,no-data-available-condition: errno = EWOULDBLOCK & sockrecv returned -1 */
ict=ict+1
if verbose>2 then call PRINTsay 'SockIn: Waiting on 'socket' ('ict')...'
Call SysSleep 1
TimeOut = TimeOut - 1; /* count down my timer */
Iterate; /* loop again */
End; /* if sockrecv = -1 error */
if res=0 then do
ok=1 ; leave /* got end of message, so exit this do loop*/
end
if res<0 then do /* non EWOULDBLOCK error */
return '0,'||" Error in sockrecv " rc
end
incoming=incoming||data
if verbose>2 then call pRINTsay 'SockIn ('socket'): total data recieved: 'length(incoming)
if length(incoming)>maxlen then do
ok=2
leave
end
ict=0 /* reset display counter */
timeout=timeout0 /* reset timeout countdown */
End /* do while timeout > 0 */
/* here we are timed out, or got entire message, or violated maxlen */
if ok=1 then do /* got message */
If sockioctl(socket,'FIONBIO',0) = -1 then do /* switch to blocking mode */
if verbose>0 then call PRINTsay 'SockIn: ioctl error on switch to blocking mode: ' errno
end
return '1,'||incoming /* success! */
end
if ok=2 then do /* maxlen violation */
if sockioctl(socket,'FIONBIO',0) = -1 then do /* switch to blocking mode */
if verbose>0 then call PRINTsay 'SockIn: ioctl error on switch to blocking mode: ' errno
end
Return '2,'||incoming
end
if ok=-1 then Return '0,'||' forced timeout on sockrecv' /* forced timed out via semaphore */
Return '0,'||' timeout on sockrecv' /* timed out (ok=0) */
err1:
call printsay " error in get_url_adv at line "sigl ","rc
return '5'
/**************************************/
/* SIMPLE version, no longer used */
/* ---------------------------------------------*/
/* get a url from some site, return first
maxchar characters (if maxchar missing, get 10million (the whole thing?)
call as: stuff=get_url(aurl,maxchar,verbose,headers)
where:
aurl: the url to GET (required)
the other 3 are optional:
maxchar: max chars to get (default=10,000,000)
verbose: verbose mode (default=OFF)
headers: list of extra request headers, CRLF delimited
*/
/* ---------------------------------------------*/
old_get_url:procedure expose logfile bold normal reverse cy_ye
parse arg aurl,maxchar,verbose,headers
if maxchar="" then maxchar=10000000
got=""
if abbrev(translate(aurl),'HTTP://')=0 then do
if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)"
return ''
end
parse var aurl . '://' server '/' request
if VERBOSE>1 then do
if length(server||request)<65 then do
call printsay " GETting: " server ", " request
end
else do
call printsay " GETting: " server " "
call printsay " " request
end /* do */
end /* do */
/* now get the url. This requires the RxSock.DLL be in your LIBPATH. */
/* Load RxSock */
if \RxFuncQuery("SockLoadFuncs") then nop
else do
call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
call SockLoadFuncs
end
crlf ='0d0a'x /* constants */
family ='AF_INET'
httpport=80
rc=sockgethostbyname(server, "serv.0") /* get dotaddress of server */
if rc=0 then do
call printsay ' Unable to resolve "'server'"'
return 0
end
dotserver=serv.0addr /* .. */
gosaddr.0family=family /* set up address */
gosaddr.0port =httpport
gosaddr.0addr =dotserver
gosock = SockSocket(family, "SOCK_STREAM", "IPPROTO_TCP")
/* Set up request */
message="GET /"request' HTTP/1.0 'crlf||'Host: 'server||crlf
if length(headers)>2 then do
if right(headers,2)=crlf then headers=left(headers,length(headers)-2)
end
if headers<>'' then message=message||headers||crlf
message=message||crlf
got=''
rc = SockConnect(gosock,"gosaddr.0")
if rc<0 then do
call printsay ' Unable to connect to "'server'"'
return 0
end
rc = SockSend(gosock, message)
/* Now wait for the response */
do r=1 by 1
rc = SockRecv(gosock, "response", 1000)
got=got||response
if rc<=0 then leave
tmplen=length(got)
if tmplen> maxchar then leave
end r
rc = SockClose(gosock)
return got
/**************************************/
/* Description of GET_URL_ADV:
Syntax:
stuff=get_url_adv(maxtime,maxsize,type,server,req,mehost,upwd,mhdrs, ,
verbose,qs,trn_Id,sock_time)
where ...
maxtime = Optional. maximum time (in seconds) to allocate for this request.
If not specified, 30 seconds is used. If the request takes longer then
maxtime to complete, stop and return an error message
maxsize = Optional. max length (in bytes) to allocate for this request.
If not specified, infinite (actually, 100M) bytes is used.
After maxsize, close connection and return an error
message and what you got (these approx. 100k bytes).
Type = type of request. Typically, HEAD or GET.
However, three special proprietary types can be specified:
DNS = lookup ip address of server. Return the IP address,
perhaps with :port appended
HEADGET = Use a GET request, but only return the HEAD (or
the first 999 characters, whichever is less)
DSCGET = Use a GET request, but only return the first
1499 characters
Server = the server to send the request to.
This can be a dns name, or an ipaddress
Alternatively, you can use the following more complete
syntax:
ip_name,ip_address,proxy_address
where:
ip_name : the ip name (or number) to send the request to.
If a non-80 port is used, append :port_number
(i.e.; foo.bar.net:8082)
ip_address : optional, the ip number for this ip name. If not
included, a DNS request is done (thus, use of
ip_address is a time saver), Do NOT append
the ip port (if needed, it should be appended
to ip_name)
proxy_address : Ip address of this proxy. If possible, give
the numeric address. If a name is given, then
a DNS request is done (thus, use of a numeric
address is a time saver)
If the proxy is using a port other then 80,
append :port_number to the proxy_address (even
if it's numeric).
For example: foo.bar.net:8080 or 152.15.22.16:8080
Notes:
* If you are NOT using a proxy server, leave proxy_address blank.
For example: foo.bar.net,189.12.51.62
* If you specify a proxy, then IP_address is ignored
* ip_name is required, even if it's the same as the ip_address (it is
used to form a Host: header)
req : the request string.
I.e.; the /mydir/index.sht portion of http://foo.bar.net/mydir/index.sht
upwd: Optional. Username password,
If username password, seperate them with a space.
For example: 'Joe Xegi3'
If specified, a Basic Authentication request header will be added.
Alternatively you could add an explicit header, in mhdrs, using:
aa=sre_pack64_make(strip(username)':'strip(passwd))
mdirs='Authentication: Basic '||aa
mehost: Optional. the callers ip address. This is used in a REFERER header,
mhdrs: optional. A crlf delimited list of headers, where each line has the
form Header_NAME: a string
verbose : Verbosity of intermediate output. Intermediate output is written
to the screen and log file.
0 = NO intermediate output (the default)
1,2,3 signal various levels (minimal, some, too much)
qs: Not required if a procedure call.
A two component argument, containing:
the queue to send results back on, and +
the semaphore to use for forcing timeouts.
This is NOT required if this is "called as a procedure", though
the semaphore component of the argument can be used (though it
is less likely to be useful)
The queue component is REQUIRED if this is "called as a daemon";
the semaphore component is not required, but if available it
will be used (it makes much more sense to use the semaphore
when calling as a deamon).
The syntax of qs_info must be:
Queue_name' 'semaphore_name
For example (without the " quotes):
"QCH_0193 /SEM32/SRE2002_32_02"
The optional semaphore_name should be an open and posted 32 bit
semaphore.
This procedure will poll this semaphore before all
socket calls, and if is NOT posted (or if it has been
closed), then an immediate timeout is forced.
If you do not include a semaphore_name, then you won't be able
to force a timeout.
Otherwise, everything else will work (the polling will
NOT occur).
trnid : Not required if a procedure call.
a transaction id. Should be unique to this transaction --
it's used to identify the response when returned through
a queue. Therefore, it is not needed when calling this
as a procedure.
sock_time : max time PER SOCKET REQUEST. MUST be less then tottime.
If not specified, sock_time_def is used
Returns:
If no error:
When called as a procedure:
0' 'ip_address||'0d0a'x||stuff
When called as daemon, get_url_adv will push
transaction||' 0 '||ip_address||'0d0a'x||stuff
to queue_name
stuff is response from the server
If error:
If maxlen, use 6 as error, but also send maxlen (more or less) bytes in STUFF
Else
Procedure call: return the status-code
Daemon call: push and transaction||' '||status_code||' '||error_message
Error codes:
0 = no error
1 = ioctl error
2 = connection error
3 = problem encountered in sockrecv
4 = sockgethostbyname error
5 = coding error in this procedure
6 = maxlen hit -- so only first maxlen characters will be returned
Examples:
Call as procedure:
moo=get_url_adv
(50,'GET','foo.bar.net,125.22.251.2','/abc/alpha.htm','mysite.net',,
'myname mypwd',,1,,,20)
as procedure, using a proxy:
oo=get_url_adv(50,'DNS','proxy.wow.com')
parse var oo . aproxy '0d0a'x .
oo=get_url_adv(50,'GET','foo.bar.net,,'||aproxy,'/abc/alpha.htm','mysite.net',,
'myname mypwd',,1,,,20)
*/
iserr:
say "error at line "sigl ',' rc
exit