h42632
s 00000/00000/01550
d D 1.15 03/05/02 20:19:59 abogaty 15 14
c 
e
s 00000/00000/01550
d D 1.14 03/05/02 02:41:29 abogaty 14 13
c 
e
s 00000/00000/01550
d D 1.13 03/04/22 17:54:33 abogaty 13 12
c 
e
s 00000/00000/01550
d D 1.12 03/04/12 01:12:04 abogaty 12 11
c reget_appendflag
e
s 00000/00000/01550
d D 1.11 03/02/04 21:41:50 abogaty 11 10
c 
e
s 00000/00000/01550
d D 1.10 02/09/18 02:00:43 abogaty 10 9
c -verifysizes
e
s 00000/00000/01550
d D 1.9 02/09/12 19:57:10 abogaty 9 8
c -texttoo flag to parse *.txt files as well
e
s 00000/00000/01550
d D 1.8 02/09/11 07:55:02 abogaty 8 7
c -relaxtime
e
s 00000/00000/01550
d D 1.7 02/09/11 04:47:08 abogaty 7 6
c use_gzip flag
e
s 00000/00000/01550
d D 1.6 02/09/03 21:28:37 abogaty 6 5
c -trim flag
e
s 00000/00000/01550
d D 1.5 02/06/20 03:42:19 abogaty 5 4
c nobreak_amp
e
s 00000/00000/01550
d D 1.4 02/06/08 03:30:37 abogaty 4 3
c Fix NWORDS => NJAVAWORDS
e
s 00000/00000/01550
d D 1.3 02/06/08 02:42:56 abogaty 3 2
c javascript:
e
s 00000/00000/01550
d D 1.2 02/05/25 04:30:46 abogaty 2 1
c no
e
s 01550/00000/00000
d D 1.1 02/05/21 20:01:33 abogaty 1 0
c date and time created 02/05/21 20:01:33 by abogaty
e
u
U
f e 0
t
T
I 1
/*
	Introduction.

	We are to remember ALL visited URLs in one big list.
	It is intended to exclude repetitions,
	since hypertext links can contain multiple links to
	the same location and to the same document,
	and even loops.

	We will add all URLs to the linked list and mark
	visited ones in it.
	However we will NEVER delete URLs from this list!
*/
#include "defs.h"

/* The HTTP scheme name alone (compared by isNotHTTP()) and the scheme
 * followed by ':' (used when curing and building full names).
 */
char HTTP_PREFIX_PURE [] = "http"   ;
char HTTP_PREFIX      [] = "http:"  ;

/* Create a new empty record about a hypertext reference */

/*
 * Allocate a new URL record, append it to the tail of the list and
 * fill it with default values.
 *
 *	list     - the list the record is appended to
 *	parent   - the record that referred to this one (may be NULL);
 *	           the new level is parent's level + 1, or 0 without parent
 *	href     - the reference as written; kept only when DEBUG is defined
 *	fullName - the full name; stored as is (not copied), so the record
 *	           takes over the caller's string
 *
 * Returns the new record. Terminates the program if memory is exhausted.
 */
URL *newURL(List *list, URL *parent, char *href, char *fullName){
	static unsigned long serial_number = 0;

	URL *ptr = (URL *) calloc(1, sizeof(URL));

	/* Without this check the code below would write through NULL */
	if(ptr == NULL){
		perror("calloc");
		exit(1);
	}

	/* Join to the list */
	if(list->head == NULL){
		list->head = list->tail = ptr;
	} else {
		list->tail->next = ptr;
		list->tail       = ptr;
	}
	ptr->next = NULL;

	ptr->parentURL = parent;        /* list->currentURL        */
	ptr->counter   = 1;             /* reference counter       */
	ptr->serial    = ++serial_number;
	ptr->flags     = UNTOUCHED;     /* has not been copied yet */
	ptr->port      = HTTPPORT;
	ptr->retcode   = 000;
	ptr->level     = parent ? parent->level + 1 : 0;
	ptr->size      = 0;
	ptr->declared_size = 0;
	ptr->trys      = 0;
	ptr->hrefs     = 0;

#ifdef DEBUG
	ptr->shortName = strdup(href);
#endif
	ptr->fullName  = fullName;      /* already strdup()ped */

	/* These are filled in later (see parseName(), processDocument()) */
	ptr->hostName  = NULL;
	ptr->urlName   = NULL;
	ptr->ctype     = NULL;
	ptr->location  = NULL;

	ptr->address_list = NULL;

	/* The serial is printed as unsigned long, as everywhere else */
	if(verbose)
		fprintf(fplog, "\t+++ newURL [%06lu]: %s\n",
			       (unsigned long) ptr->serial, fullName);

	return ptr;
}

/* ____________________________________________________________________________ */
/*
	http://host/docpath
	ftp://host/docpath
	file://docpath
	gopher://host/docpath
	wais://host/group
	mailto:user@host
	news:newsgroup

*/

/*
 * Scheme names recognized by isFullURL() and isFullURL3().
 * Both functions scan the table in order until the terminating NULL.
 */
char *URLtypes[] = {
	"http",

	"file",
	"ftp",
	"wais",
	"gopher",
	"mailto",
	"news",
	"telnet",
	"archie",
	"doc",          /* Hot Java's */
	NULL
};

/*
 * Tell whether href begins with one of the scheme names of URLtypes[]
 * (compared case-insensitively) immediately followed by ':'.
 *
 * On a match the scheme part of href is lowercased in place and *rest
 * is set to the text after "scheme:" or, when present, after
 * "scheme://". Without a match *rest is left NULL.
 *
 * Returns TRUE on a match, FALSE otherwise.
 */
Bool isFullURL(char *href, char **rest){
	int i;

	*rest = NULL;

	for(i = 0; URLtypes[i] != NULL; i++){
		int   n = strlen(URLtypes[i]);
		char *after;

		if(strncasecmp(href, URLtypes[i], n) != 0)
			continue;
		if(href[n] != ':')
			continue;

		/* cut the string at ':' so that only the scheme is lowercased */
		href[n] = '\0';
		lowercase((unsigned char *)href);
		href[n] = ':';

		/* will be used in computeFullName() */
		after = href + n + 1;
		if(after[0] == '/' && after[1] == '/')
			after += 2;     /* http://hostname/.... */
		*rest = after;

		return TRUE;
	}
	return FALSE;
}

/*
 * Stricter variant of isFullURL(): the scheme name from URLtypes[]
 * must be followed by the three characters "://".
 *
 * On a match the scheme part of href is lowercased in place.
 * Returns TRUE on a match, FALSE otherwise.
 */
Bool isFullURL3(char *href){
	int i;

	for(i = 0; URLtypes[i] != NULL; i++){
		int n = strlen(URLtypes[i]);

		if(strncasecmp(href, URLtypes[i], n) != 0)
			continue;
		if(strncmp(href + n, "://", 3) != 0)
			continue;

		/* cut the string at ':' so that only the scheme is lowercased */
		href[n] = '\0';
		lowercase((unsigned char *)href);
		href[n] = ':';

		return TRUE;
	}
	return FALSE;
}

/*
 * Returns FALSE when href starts (case-insensitively) with
 * HTTP_PREFIX_PURE, TRUE otherwise. Only the prefix is compared,
 * so any name that merely begins with "http" counts as HTTP.
 */
Bool isNotHTTP(char *href){
	if(strncasecmp(href, HTTP_PREFIX_PURE, STRLEN(HTTP_PREFIX_PURE)) == 0)
		return FALSE;

	return TRUE;
}

/*
 * Decide whether the host of ptr may be visited.
 *
 * Every host is legal when recflag is TRUE or when the list is still
 * empty. Otherwise the host must either carry the same name as the
 * head of the list (case-insensitive) or be accepted by
 * oneOfAddresses() against the head's address list; in the latter
 * case ptr->hostName is replaced by a copy of the head's host name.
 *
 * Returns TRUE if the host is legal, FALSE otherwise.
 */
Bool isLegalHost(List *list, URL *ptr){
	URL  *first;
	char *candidate = ptr->hostName;

	/* All hosts are allowed */
	if(recflag == TRUE)
		return TRUE;

	/* list->head can have ANY name since it is the very first name ! */
	first = list->head;
	if(first == NULL)
		return TRUE;

	/* the very same name as our first host */
	if(strcasecmp(first->hostName, candidate) == 0)
		return TRUE;

	/* not an alias of the first host either */
	if(!(first->address_list && oneOfAddresses(candidate, first->address_list)))
		return FALSE;

	/* an alias: switch ptr to the name used by the first host */
	fprintf(fplog,  "\t@@@ NAME changed: %s --> %s\n",
			       candidate, first->hostName);
	fprintf(stderr, "@@@ NAME changed: %s --> %s\n",
			       candidate, first->hostName);

	free(ptr->hostName);
	ptr->hostName = strdup(first->hostName);

	return TRUE;
}

/* ____________________________________________________________________________ */
/*
	href="..."              OR      href=...
	src="..."                       src=...
	background="..."                background=...
				But this must cause a warning message!

	Fully qualified names
	---------------------
	http://hostname/pathname
	ftp://hostname/pathname
	wais://hostname/pathname
	gopher://hostname/pathname
	mailto:emailaddress
	news:newsgroup

	Relative names (use current host)
	---------------------------------
	file://pathname
	/pathname
	pathname

	Jumps inside the body (ignore #labelname)
	-----------------------------------------
	...pathname#labelname

	Paths in MS DOS
	---------------
	http://host/C:/pathname

	At present there is not a single line of code that handles
	this poor style.
*/

/* Relative path components recognized by computeFullName():
 * the forms with a trailing '/' are matched as prefixes,
 * the bare forms (names ending in __) are matched as the whole href.
 */
char ThisDir    [] = "./";
char ThisDir__  [] = ".";
char ParentDir  [] = "../";
char ParentDir__[] = "..";

/*
 * Build the full name of a reference found in a document.
 *
 *	list        - the URL list; used only to add the cured form of an
 *	              ill "http:dir/file" reference through addURL()
 *	currentURL  - the record href is relative to (the current document
 *	              or the <BASE HREF>); may be NULL, in which case a
 *	              relative href is simply copied
 *	href        - the reference text; its scheme is lowercased in place
 *	              by isFullURL() and the rest is passed to
 *	              lowercaseHostName()
 *	addflags    - IN-OUT: IGNORED is or-ed in when href climbs through
 *	              ".." while skipparentdir is set; never reset here
 *
 * Returns a heap string owned by the caller (addURL() free()s it or
 * hands it to newURL()). A "#label" or ";params" tail in the last path
 * component is stripped from the result.
 *
 * Side effects: currentURL->hostName is lowercased in place, and a
 * currentURL->urlName that does not start with '/' is replaced by a
 * corrected copy.
 */
char *computeFullName(List *list, URL *currentURL, char *href, int *addflags /* IN-OUT */){
	char *fullName, *s, *from;
	char *rest;

/*
	Must be done outside this function.
	*addflags = 0;
*/

#ifdef NOTDEF
	while(isspace(*href))
		href++;
#endif

	/* Curing the ill name... ========================= */
	if(ill_urls_flag){
		if( !strncmp(href, HTTP_PREFIX, STRLEN(HTTP_PREFIX))   &&
			     href[STRLEN(HTTP_PREFIX)  ] == '/'        &&
			     href[STRLEN(HTTP_PREFIX)+1] != '/'        &&
			     href[STRLEN(HTTP_PREFIX)+1] != '\0'
		){
			/*  href:/dir/doc
			 *       NOT
			 *  http://host/dir/doc
			 */
			fprintf(fplog, "\t@@@ Strange reference: %s, cutting to: %s\n", href, href + STRLEN(HTTP_PREFIX));
			href += STRLEN(HTTP_PREFIX);
		}
	}
	/* http:dir/file -- the part after "http:" is added as one more
	 * reference; the original href is still processed below.
	 */
	if( !strncmp(href, HTTP_PREFIX, STRLEN(HTTP_PREFIX)) &&
	    href[STRLEN(HTTP_PREFIX)] != '\0'                &&
	    href[STRLEN(HTTP_PREFIX)] != '/'
	){
		fprintf(fplog, "\t@@@ Strange reference: %s, adding: %s\n", href, href + STRLEN(HTTP_PREFIX));

		addURL(list, href + STRLEN(HTTP_PREFIX), NoFlags);
	}
	/* END Curing the ill name... ===================== */

	if(isFullURL(href, &rest)) {            /* is a FULL NAME of URL */
		if(rest) lowercaseHostName(rest);

		fullName = strdup(href);
	}
	else if(currentURL) {                   /* must construct the full name */

		/* Then http://<currentHost>.....       supposed
		 * <currentHost> will be taken from the currentURL
		 */

		/* parsed earlier at the moment of adding of currentURL */
		char *host = currentURL->hostName;
		char *url  = currentURL->urlName;

		/* Check for NULL BEFORE touching the host name; the literal
		 * fallback must not be passed to lowercase() either.
		 */
		if(host == NULL){
			fprintf(fplog, "\t!!! Host is undefined (%lu)!!!\n", currentURL->serial);
			host = "nohost";
		} else {
			lowercase((unsigned char*)host);
		}
		if(url == NULL){
			fprintf(fplog, "\t!!! URL is undefined (%lu)!!!\n", currentURL->serial);
			url = "/none.none";
		}
		/* Since we do use current URL, its full name (with stripped hostname)
		 * MUST begin with '/' -- from the root directory !!!
		 */
		if(*url != '/'){
			fprintf(fplog, "\t!!! URL name does not begin with '/' %s (%lu)!!!\n",
				       url, currentURL->serial);

			/* fix this */
			url = strspl("/", url, NULL);
			free(currentURL->urlName);
			currentURL->urlName = url;
		}

		if(*href == '/')        /* ABSOLUTE PATHNAME */

			fullName = strspl(HTTP_PREFIX, "//", host, href, NULL);

		else {                  /* RELATIVE PATHNAME */

			char *urlcopy = strdup(url);
			/* a copy for the destructive manipulations */

			/* ---------------------------------------------------------------------- */
			if(strncmp(href, ParentDir,   STRLEN(ParentDir  )) == 0 ||
			   strcmp (href, ParentDir__                     ) == 0 ||
			   strncmp(href, ThisDir,     STRLEN(ThisDir    )) == 0 ||
			   strcmp (href, ThisDir__                       ) == 0
			){

				/* TRUE until the document name has been cut off urlcopy */
				int extractCurrentDir = TRUE;

#define EXTRACTCOMPONENT                                        \
	/* ...../xxxxx/yyyyy --> ...../xxxxx */                 \
	/* cut off the last component of the urlcopy */         \
								\
	s = strrchr(urlcopy, '/'); /* Cannot be NULL */         \
								\
	/* Don't touch the leading '/' -- root directory */     \
	if(s != urlcopy){                                       \
		*s = '\0';                                      \
	} else {                                                \
		/* leave alone '/' */                           \
		urlcopy[1] = '\0';                              \
	}
				/* ------------------------------------------------------- */
				/*
				 * HREF="./something.html"
				 *      is equal to
				 * HREF="something.html"
				 *
				 */
				while(strncmp(href, ThisDir, STRLEN(ThisDir)) == 0 ||
				      strcmp (href, ThisDir__) == 0
				){
					if(strcmp(href, ThisDir__) == 0)
						href += STRLEN(ThisDir__);
					else    href += STRLEN(ThisDir);

					while(*href == '/') href++;

				/* Stage 1: Extract the name of the current directory:
				 * currentURL="http://host/.../parent/thisdir/thisdoc.html"
				 * currentDIR="http://host/.../parent/thisdir"
				 *
				 * This stage must be applied ONCE for the whole path:
				 * a repeated "./" ("././doc") still means the current
				 * directory and must not climb further up.
				 */
					if(extractCurrentDir == TRUE){
						EXTRACTCOMPONENT;
						extractCurrentDir = FALSE;
					}
				}

				/* ------------------------------------------------------- */
				/*
				 * HREF="../something.html"
				 *    where
				 * currentURL="http://host/.../parent/thisdir/thisdoc.html"
				 *
				 * Then the full name for the HREF must be
				 *
				 *       HREF="http://host/.../parent/something.html"
				 *
				 */
				while(strncmp(href, ParentDir, STRLEN(ParentDir)) == 0 ||
				      strcmp (href, ParentDir__) ==0
				){

					if(strcmp(href, ParentDir__) == 0)
						href += STRLEN(ParentDir__);
					else    href += STRLEN(ParentDir);

					while(*href == '/') href++;

					/* now HREF="something.html" or "" */

					if(extractCurrentDir == TRUE){

				/* Stage 1: Extract the name of the current directory:
				 * currentURL="http://host/.../parent/thisdir/thisdoc.html"
				 * currentDIR="http://host/.../parent/thisdir"
				 *
				 * This stage must be applied ONCE for the whole path.
				 */
						EXTRACTCOMPONENT;
						extractCurrentDir = FALSE;
					}

				/* ------------------------------------------------------- */
				/* Stage 2: Extract the name of parent directory:
				 * currentDIR="http://host/.../parent/thisdir"
				 * parentDIR ="http://host/.../parent"
				 */
					EXTRACTCOMPONENT;

					if(skipparentdir){
						fprintf(fplog, "\t@@@ Will be ignored because of href=\"..\"\n");
						*addflags |= IGNORED;
					}
				}
				/* ------------------------------------------------------- */

				/* glue directory and the rest of href with exactly one '/' */
				s = strlast(urlcopy);
				if(*s== '/')
					fullName = strspl(HTTP_PREFIX, "//", host, urlcopy,      href, NULL);
				else    fullName = strspl(HTTP_PREFIX, "//", host, urlcopy, "/", href, NULL);

				fprintf(fplog, "\t;;; Converted to: %s\n", fullName);

			/* ---------------------------------------------------------------------- */
			} else {
				s = strlast(url);

				if(*s == '/')   /* is a directory */
						/* make a doc inside it */

					fullName = strspl(HTTP_PREFIX, "//", host, url, href, NULL);
					/* I remind again that url begins with '/' */

				else {          /* make a sibling of currentURL in the same dir */

					if((s = strrchr(urlcopy, '/')) != NULL)
						*s = '\0';
					/* even if s==urlcopy do here
					 *      "/" --> ""
					 */

					/* We extracted the name of the directory
					 * containing both the currentURL and the new one
					 */

					fullName = strspl(HTTP_PREFIX, "//", host, urlcopy, "/", href, NULL);

				}
			}
			/* ---------------------------------------------------------------------- */
			free(urlcopy);
		}

	} else {
		/* a relative name, but nothing to resolve it against */
		fprintf(fplog, "\t!!! Warning: unknown URL name format !!!\n");
		fullName  = strdup(href);
	}

	/*
		Strip out the anchor name:

			...host/urlname#label   ->      ...host/urlname
			...host/urlname/#label  ->      ...host/urlname

		Only the part after the last '/' is searched, so s always
		points past that '/' and s[-1] is inside the string.
	*/
	if((from = strrchr(fullName, '/')) != NULL){
		if((s = strrchr(from, '#')) != NULL){
			if(!nowarnflag){
				fprintf(fplog,  "\t*** Warning: %s stripped to ", fullName);
				fprintf(stderr, "\t*** Warning: %s stripped to ", fullName);
				/* to be continued later... */
			}

			*s = '\0';

			if(s[-1] == '/')        /* ....../#...... */
			   s[-1]  = '\0';       /* ......         */

			if(!nowarnflag){
				fprintf(fplog,  "%s\n", fullName);
				fprintf(stderr, "%s\n", fullName);
			}
		}
	}
	/* The same for a ";..." tail of the last path component */
	if((from = strrchr(fullName, '/')) != NULL){
		if((s = strrchr(from, ';')) != NULL){
			if(!nowarnflag){
				fprintf(fplog,  "\t*** Warning: %s stripped to ", fullName);
				fprintf(stderr, "\t*** Warning: %s stripped to ", fullName);
				/* to be continued later... */
			}

			*s = '\0';

			if(s[-1] == '/')        /* ....../;...... */
			   s[-1]  = '\0';       /* ......         */

			if(!nowarnflag){
				fprintf(fplog,  "%s\n", fullName);
				fprintf(stderr, "%s\n", fullName);
			}
		}
	}

	return fullName;
}

/* ____________________________________________________________________________ */

/* Extract fields "hostName", "urlName", "port" from the "fullName" */

/*
 * Split ptr->fullName into host name, URL path and port and store them
 * in ptr->hostName, ptr->urlName and ptr->port.
 *
 * - A name without any ':' is replaced by HTTP_PREFIX "//" name.
 * - A name whose scheme is not HTTP is flagged IGNORED and left
 *   unparsed, unless the record carries the BASEHREF flag.
 * - The host part of ptr->fullName is lowercased in place.
 * - A missing path becomes "/"; a run of leading '/' in the path is
 *   reduced to a single one.
 * - A ":port" suffix is always cut off the host name; the port itself
 *   is stored only when reassignportsflag is set.
 * - hostName / urlName that are already set are left untouched.
 */
void parseName(URL *ptr){
	char *cursor = ptr->fullName;   /* must be qualified with http:// ftp:// etc */
	char *colon, *slash, *portsep;
	char *host, *path;

	/*
		cursor
		|
		http://hostname/urlpathname
		    |
		    colon
	*/
	while((colon = strchr(cursor, ':')) == NULL){
		fprintf(fplog, "\t@@@ Bad URL name format: %s\n", cursor);

		/* fix this: suppose it is an HTTP name and look again */
		cursor = strspl(HTTP_PREFIX, "//", cursor, NULL);
		free(ptr->fullName);
		ptr->fullName = cursor;

		fprintf(fplog, "\t@@@ Supposing %s\n\n", cursor);
	}

	/* look at the scheme alone */
	*colon = '\0';

	/* Flag BASEHREF may come from the <BASE HREF=...> */
	if(isNotHTTP(cursor) && !(ptr->flags & BASEHREF)){
		ptr->flags |= IGNORED;

		*colon = ':';   /* restore */
		fprintf(fplog, "\t*** Don't process this kind of URLs: %s\n\n", cursor);

		return;
	}
	*colon = ':';           /* restore */

	/* step over "scheme:" and all the slashes that follow it */
	cursor = colon + 1;
	while(*cursor == '/')
		cursor++;

	/*
		       cursor
		       |
		http://hostname/urlpathname
		    |          |
		    colon      slash
	*/
	slash = strchr(cursor, '/');

	if(slash != NULL){
		/* isolate the host name just for lowercasing and copying */
		*slash = '\0';
		lowercase((unsigned char *)cursor);
		host = strdup(cursor);
		*slash = '/';

		/* keep exactly one leading '/' of the path */
		while(*slash == '/')
			slash++;
		slash--;
		path = strdup(slash);
	} else {
		lowercase((unsigned char *)cursor);
		host = strdup(cursor);
		path = strdup("/");             /* HOME PAGE */

		/* path = strdup("/" INDEXNAME); */
	}

	/* hostname may be in the form HOST:port */
	portsep = strrchr(host, ':');
	if(portsep != NULL){
		int nport;

		*portsep = '\0';
		if(reassignportsflag){
			nport = atoi(portsep + 1);
			ptr->port = htons(nport);
#ifdef DEBUG
			fprintf(fplog, "\t*** Port number %d is declared for %s\n\n",
							  nport,             ptr->fullName);
#endif
		}
	}
	/* otherwise ptr->port keeps HTTPPORT set by newURL() */

	/* Remember the parsed parts, unless the record already has them */
	if(ptr->hostName != NULL)
		free(host);
	else
		ptr->hostName = host;

	if(ptr->urlName != NULL)
		free(path);
	else
		ptr->urlName = path;
}

/* ____________________________________________________________________________ */

/* Add URL record to the list. Check if it is already there */

int add_counter;

/* <base href=...> (if any) affects computing of computeFullName() func. */

/*
 * Add a reference to the list, unless it is already there.
 *
 *	list     - the URL list
 *	href     - the reference as found; resolved against basehref if
 *	           one is set, otherwise against list->currentURL
 *	addflags - flags or-ed into a newly created record (usually 0)
 *
 * Returns the existing record (its reference counter is incremented)
 * or the newly created one. Returns NULL when the full name is empty
 * or when computeFullName() marked the reference IGNORED and the
 * caller did not pass FORCED.
 *
 * Every reference that passes these checks is counted in add_counter.
 */
URL *addURL(List *list, char *href, int addflags /* usually = 0 */){
	char *fullName;
	URL *ptr;

	/* Compute the full name of the new URL */
	/* computeFullName() returns malloc()ed string */

	fullName = computeFullName(list,
				   basehref ? basehref : list->currentURL,
				   href,
				   &addflags /* usually =0 */
				   );
	if(fullName == NULL)
		return NULL;

	/* Let's think a moment if there can be an empty URL name. Yes, it can:
		HREF="#label12"
		(reference to the other point inside the same document)
		After the striping out '#' it becomes "".
	 */
	if(!*fullName){
		free(fullName);         /* ours to release: nobody keeps it */
		return NULL;
	}

	if(verbose)
		fprintf(fplog, "\t+++ addURL: %s --> %s\n", href, fullName);

	if((addflags & IGNORED) && !(addflags & FORCED)){
		/* computeFullName() told us that this name is not to process */
		free(fullName);
		return NULL;
	}

	add_counter++;  /* count added names */

	/* search in the list if this URL is already there */
	for(ptr=list->head; ptr != NULL; ptr=ptr->next){
#ifdef DEBUG_CMP
		if(verbose) fprintf(fplog, "\t--- Check: %04lu '%s', '%s'\n", (unsigned long) ptr->serial, fullName, ptr->fullName); /* @ABS@ */
#endif

		if(strcmp(ptr->fullName, fullName) == 0){

			/* yes, already there */
			if(verbose)
				fprintf(fplog, "\t+++ THERE: %s\n", fullName);
			ptr->counter++;
			free(fullName);

			return ptr;     /* existing one */
		}
	}

	/* else add new one; the record takes over fullName */
	ptr = newURL(list, list->currentURL, href, fullName);

	/* extract parts of the full name
	   and canonize them
	 */
	parseName(ptr);

	/* The very first URL defines the directory the walk is restricted
	 * to. parseName() leaves urlName NULL for names it refuses to
	 * parse, so check it before copying.
	 */
	if(underflag && ptr == list->head && ptr->urlName != NULL){
		char *ss;
		underdir = strdup(ptr->urlName);
		ss = strrchr(underdir, '/');
		if(ss) ss[1] = '\0';    /* keep the directory part only */

		fprintf(fplog,  "@@@ Restrict paths to directory: %s\n", underdir);
		fprintf(stderr, "@@@ Restrict paths to directory: %s\n", underdir);
	}

	if(skipparents){
		if(isParent(list->currentURL, ptr)){
			fprintf(fplog, "\t@@@ Will be ignored because of href to parent\n");
			addflags |= IGNORED;
		}
	}
	ptr->flags |= addflags;

	/* marks the record EXISTS when its file is already on disk */
	(void) skipExisting(list, ptr);

	return ptr;
}

/*
 * In append mode, tell whether the document of ptr is already on disk
 * and therefore need not be fetched again.
 *
 * Only untouched records other than the head of the list, with both
 * hostName and urlName known, are considered. With weak_appendflag,
 * directory indexes and .html/.htm/.shtml/.rhtml documents are never
 * skipped. A document is skipped when its cache file exists, is not
 * empty and is not a directory; the record is then flagged EXISTS.
 *
 * Returns TRUE if the URL is to be skipped, FALSE otherwise.
 */
Bool skipExisting(List *list, URL *ptr){
	char outname[MAXPATHLEN];
	char *postfix = "";
	struct stat st;
	Bool isIndex;

	if(!appendflag)
		return FALSE;
	if(!isUntouched(ptr))           /* not a document to be processed at all */
		return FALSE;
	if(ptr == list->head)           /* the first document is always processed */
		return FALSE;
	if(!ptr->hostName || !ptr->urlName)
		return FALSE;

	/* the name of the file the document would be saved to */
	isIndex = computeFileName(outname, ptr->hostName, ptr->urlName, ptr->port, postfix);

	if(weak_appendflag){
		/* reget dirs always */
		if(isIndex == TRUE)
			return FALSE;

		/* always reget HTML files (but not others) */
		if(strsuffix(ptr->urlName, ".html"))
			return FALSE;
		if(strsuffix(ptr->urlName, ".htm"))
			return FALSE;
		if(strsuffix(ptr->urlName, ".shtml"))
			return FALSE;
		if(strsuffix(ptr->urlName, ".rhtml"))
			return FALSE;
	}

	/* Test if such FILE already exists and is not zero-length */
	if(stat(outname, &st) < 0)
		return FALSE;
	if(!(st.st_size > 0))
		return FALSE;
	if(isdir(st))
		return FALSE;

	/* Yes, it does exist. Don't process this URL */
	fprintf(fplog, "\t*** File exists, skip: %s\n\n", ptr->fullName);
	ptr->flags |= EXISTS;

	return TRUE;
}

/* ____________________________________________________________________________ */
/* The interface for the first call from main() */

/*
 * Reset the list to the empty state (no head, no tail, no current
 * URL) and add href as its first entry.
 */
void addFirst(List *list, char *href){
	list->head       = NULL;
	list->tail       = NULL;
	list->currentURL = NULL;

	(void) addURL(list, href, NoFlags);
}
/*
 * Read references from a text file, one per line, and add each of
 * them to the list.
 *
 * While a line is being added, list->currentURL temporarily points to
 * the tail of the list, so the line is resolved against the most
 * recently added record; the previous value is restored afterwards.
 *
 * A file that cannot be opened is reported through myperror().
 */
void loadURLFile(List *list, char *filename){
	FILE *fp;
	char buffer[5 * 1024], *s;
	URL *save, *added;

	if((fp = fopen(filename, "r")) == NULL){
		myperror(filename);
		return;
	}
	while(fgets(buffer, sizeof buffer, fp) != NULL){
		/* fgets() keeps the newline; drop it */
		if((s = strchr(buffer, '\n')) != NULL)
			*s = '\0';

		save = list->currentURL;
		list->currentURL = list->tail;

		added = addURL(list, buffer, NoFlags);

		/* addURL() returns NULL for empty or ignored names */
		if(added != NULL)
			fprintf(stderr, "### Added %s\n", added->fullName);
		else
			fprintf(stderr, "### Not added %s\n", buffer);

		list->currentURL = save;
	}
	fclose(fp);
}

/* ____________________________________________________________________________ */

/* Get URL from the WWW server                */
/* Simultaneously find all the HREFs included */

/* URL path prefixes tested by processURL(): CGI scripts in general,
 * the images directory that is exempt from skipcgiflag, and image maps.
 */
char CGIbin         [] = "/cgi-bin/";
char CGIbin_images  [] = "/cgi-bin/images/";
char CGIbin_imagemap[] = "/cgi-bin/imagemap/";

/*
 * Process one record of the list: log it, apply the skip rules and,
 * if nothing forbids it, fetch the document through callHost().
 *
 * The record is left with PROCESSED set, plus one of:
 *	IGNORED   - not HTTP, rejected by checkIfToSkip(), a CGI script
 *	            (with skipcgiflag) or outside of underdir
 *	SKIPPED   - the host is not legal (see isLegalHost())
 *	DONTPARSE - a root index while skiprootdir is set (the document
 *	            is still fetched)
 * and whatever flags callHost() returns.
 *
 * An image map path /cgi-bin/imagemap/dir/name.map additionally adds
 * /dir/name.map to the list.
 */
void processURL(List *list, URL *ptr /* = list->currentURL */ ){

	/* the full name and its parsed parts, as they are on entry */
	char *name = ptr->fullName;
	char *host = ptr->hostName;
	char *path = ptr->urlName;

	int untouched, retries;

	untouched = countRemaining(list, &retries);

	fprintf(fplog, "### %s \"%s\"\n### [%lu.%d:%d] %s\n### [%d left; %d to retry]\n",
		ptr->trys > 0 ? "Retrying" : "Processing",
		name,
		ptr->serial, ptr->level, ptr->trys,
		currentDate(),
		untouched, retries
	);
	if(debugflag > 1)
		fprintf(stderr, "Processing %s\n", name);

	/* Nothing to do for names we never fetch */
	if((ptr->flags & IGNORED) || isNotHTTP(name)){
		fprintf(fplog, "\t@@@ Don't process this kind of URLs: %s\n\n", name);

		ptr->flags |= PROCESSED;
		return;
	}
	if(checkIfToSkip(ptr) == TRUE){
		ptr->flags |= (PROCESSED|IGNORED);
		return;
	}

	/* Get the root index itself, but don't follow its HREFs */
	if(skiprootdir){
		if(strcmp(path, "/") == 0 || strcmp(path, "/" INDEXNAME) == 0){
			fprintf(fplog, "\t@@@ Dont parse HREFs in root index: %s\n\n", name);

			ptr->flags |= DONTPARSE;
		}
	}

	fprintf(fplog, "\tHOST: %s:%u\n", host, ntohs(ptr->port));
	fprintf(fplog, "\tURL:  %s\n\n",  path);

	/* href=http://host/cgi-bin/imagemap/dir/name.map
		must be really get as the file
		http://host/dir/name.map
	 */
	if(strncasecmp(path, CGIbin_imagemap, STRLEN(CGIbin_imagemap)) == 0){
		/* "- 1" keeps the '/' that ends the prefix */
		addURL(list, path + STRLEN(CGIbin_imagemap) - 1, FORCED);
		fprintf(fplog, "\t@@@ Adding Image Map: %s\n", path + STRLEN(CGIbin_imagemap) - 1);
	}

	/* no "else" here: the image map URL itself goes on */

	if(skipcgiflag &&
	   strncasecmp(path, CGIbin,        STRLEN(CGIbin))        == 0 &&  /* under /cgi-bin/    */
	   strncasecmp(path, CGIbin_images, STRLEN(CGIbin_images)) != 0     /* but not its images */
	){
		fprintf(fplog, "\t@@@ Don't process CGI scripts\n");
		ptr->flags |= IGNORED;
	}
	else if(isLegalHost(list, ptr) != TRUE){        /* checks ptr->hostName */
		fprintf(fplog, "\t@@@ Don't look at host: %s\n", host);
		ptr->flags |= SKIPPED;
	}
	else if(underdir && ptr->urlName &&
		! strsuffix(ptr->urlName, ".txt") &&    /* suffix is NOT .txt */
		! isimage(ptr)                    &&
		! (ptr->flags & REDIRECTEDTO)     &&    /* is not redirected URL */
		strncmp(ptr->urlName, underdir, strlen(underdir)) != 0   /* differs */
	){
		/* Ignore it */
		ptr->flags |= IGNORED;
		fprintf(fplog, "\t@@@ Ignored, not under %s\n", underdir);
	}
	/* Check again after the possible name change */
	else if(skipExisting(list, ptr) == FALSE){      /* don't skip this URL */
		/* call first, then merge: callHost() gets the record itself */
		int code = callHost(list, ptr);
		ptr->flags |= code;
	}

	fprintf(fplog, "\n");

	ptr->flags |= PROCESSED;

	if(ptr->flags & RETRY)
		fprintf(stderr, "### This document will be retried\n\n");

	/* DEBUGGING SECTION */
	reportShortList(list, fpreport);
}

/* ____________________________________________________________________________ */

/*
 * The main processing loop. Each round:
 *
 *   1. stops the program through die(0) after more than 5 write errors;
 *   2. scans the list for records flagged REGET: the first one that
 *      has been tried less than twice is processed again and the round
 *      restarts; on the others the flag is just cleared;
 *   3. scans the list for the first record that is either flagged
 *      RETRY with less than RETRY_IMMEDIATELY tries or still
 *      untouched, and processes it.
 *
 * The loop ends when step 3 reaches the end of the list without
 * finding anything. list->currentURL serves as the scan cursor and is
 * NULL on return.
 */
void mainLoop(List *list){

	for(;;){
		Bool regot = FALSE;

		if(error_counter > 5){
			fprintf(fplog,  "### TOO MANY WRITE ERRORS, EXITTING\n");
			fprintf(stderr, "### TOO MANY WRITE ERRORS, EXITTING\n");

			die(0);
		}

		/* Check REGETs */
		list->currentURL = list->head;
		while(list->currentURL != NULL){

			if(list->currentURL->flags & REGET){

				if(list->currentURL->trys < 2){  /* retry only once */

					fprintf(fplog,  "### Regetting\n");
					fprintf(stderr, "### Regetting\n");

					setUntouched(list->currentURL);

					/* Try to get !RANDOM or size != 0 */
					processURL(list, list->currentURL);

					/* only once: erase new REGET */
					list->currentURL->flags &= ~REGET;

					regot = TRUE;
					break;
				}

				list->currentURL->flags &= ~REGET;
			}
			list->currentURL = list->currentURL->next;
		}
		if(regot == TRUE)
			continue;       /* start the whole round again */

		/* Check UNTOUCHED and RETRYs */
		list->currentURL = list->head;
		while(list->currentURL != NULL){

			if((list->currentURL->flags & RETRY) &&
			    list->currentURL->trys < RETRY_IMMEDIATELY){
				/* retry it immediately */
				setUntouched(list->currentURL);
				processURL(list, list->currentURL);
				break;
			}
			if(isUntouched(list->currentURL)){
				processURL(list, list->currentURL);
				break;
			}
			list->currentURL = list->currentURL->next;
		}

		/* done - the tail of the list is reached */
		if(list->currentURL == NULL)
			break;
	}
}

/*
 * Count the records of the list that are still untouched.
 *
 *	to_retry - if not NULL, receives the number of records that
 *	           carry the RETRY flag
 *
 * Returns the number of untouched records.
 */
int countRemaining(List *list, int *to_retry){
	int  untouched = 0;
	int  retrying  = 0;
	URL *u = list->head;

	while(u != NULL){
		if(isUntouched(u))
			untouched++;
		if(u->flags & RETRY)
			retrying++;
		u = u->next;
	}

	if(to_retry != NULL)
		*to_retry = retrying;

	return untouched;
}

/* Mark URLs as not tried */
/*
 * Prepare the next pass: every record flagged RETRY is made untouched
 * again, except for those that have been tried exactly MAXTRYS times,
 * which lose the RETRY flag instead.
 *
 * NOTE(review): the limit is tested with "==", so a record whose try
 * counter has passed MAXTRYS would be retried again - confirm that
 * the counter cannot skip that value.
 */
void retryNotReceived(List *list){
	URL *ptr;

	fprintf(stderr, "\n### NEXT PASS ###\n\n");

	for(ptr = list->head; ptr != NULL; ptr = ptr->next){
		if(!(ptr->flags & RETRY))
			continue;

		if(ptr->trys == MAXTRYS){
			/* give up on this one */
			fprintf(fplog, "@@@ Too many retrials: %s\n", ptr->fullName);
			ptr->flags &= ~RETRY;
		} else {
			/* i.e. ready for retrial */
			setUntouched(ptr);
		}
	}
}
/* ____________________________________________________________________________ */

/*
 * Compose the name of the cache file for a document.
 *
 *	outname  - receives the result; the caller must supply a buffer
 *	           large enough, the length is NOT checked here
 *	hostname - host part of the URL
 *	urlname  - path part of the URL
 *	port     - port in network byte order; appended to the host as
 *	           ":port" only when reassignportsflag is set and the
 *	           port differs from HTTPPORT
 *	postfix  - text appended at the very end of the name
 *
 * The name is rooted at mylogdir if that is set, at HOME/SPOOLDIR
 * otherwise. A path ending in '/' gets INDEXNAME appended.
 *
 * Returns TRUE if the path denotes an index (ends in '/'),
 * FALSE otherwise.
 */
Bool computeFileName(char *outname, char *hostname, char *urlname, u_short port, char *postfix){
	char  happendx[32];
	char *indexpart;
	Bool  is_index;

	/* optional ":port" suffix of the host directory */
	if(reassignportsflag && HTTPPORT != port)
		sprintf(happendx, ":%d", ntohs(port));
	else
		happendx[0] = '\0';

	if(*strlast(urlname) == '/'){
		is_index  = TRUE;
		indexpart = INDEXNAME;
	} else {
		is_index  = FALSE;
		indexpart = "";
	}

	if(mylogdir)
		sprintf(outname, "%s/%s%s%s%s%s",
			mylogdir, hostname, happendx, urlname, indexpart, postfix);
	else
		sprintf(outname, "%s/%s/%s%s%s%s%s",
			HOME, SPOOLDIR, hostname, happendx, urlname, indexpart, postfix);

	return is_index;
}

/* Response header prefixes and content types looked for by
 * processDocument(); all of them are compared case-insensitively.
 * Note that only status lines starting with "HTTP/1.0 " are matched.
 */
char ContentLength [] = "Content-length: ";
char ContentType   [] = "Content-type: ";
char HTTPreport    [] = "HTTP/1.0 ";
char TEXT_HTML     [] = "text/html";
char X_DIRECTOR    [] = "application/x-director";
char Location      [] = "Location: ";

/*
 * Process a raw server response stored in a file: read the header,
 * follow redirections, save the body into the cache file and feed
 * HTML bodies to the parser.
 *
 *	list     - the URL list; new references are added to it
 *	urlptr   - the record of the document; its declared_size, ctype,
 *	           location, retcode, hrefs, size and flags are updated
 *	filename - the file holding the response (header + body)
 *
 * Returns 0 on success, -1 if the response file or the cache file
 * cannot be opened.
 *
 * basehref is reset to NULL before a successful return, because
 * <BASE HREF> is local to one document.
 */
int processDocument(List *list, URL *urlptr, char *filename){
	FILE *fpin, *fpout;
	State state = HEADER;
	char buffer[10000];
	char outname[MAXPATHLEN * 2], *postfix = "";
	char *s;
	size_t size      = NOSIZE;
	Bool is_html     = FALSE;
	Bool dosave      = TRUE;
	Bool redirection = FALSE;
	int c;
	HttpCode retcode = HTTP_OK;             /* 000 ??? */
	struct stat st;
	size_t nlines = 0L;

	char *hostname = urlptr->hostName;
	char *urlname  = urlptr->urlName;

	if((fpin = fopen(filename, "r")) == NULL){
		fprintf(fplog, "*** Cannot open %s for %s:%s\n", filename, hostname, urlname);
		return (-1);
	}

	/* ---- The header: one field per line, up to the first empty line ---- */
	while(fgets(buffer, sizeof buffer, fpin) != NULL){
		if((s = strchr(buffer, '\n')) != NULL) *s = '\0';
		if((s = strchr(buffer, '\r')) != NULL) *s = '\0';
		nlines++;

		if(debugflag >= 3)
			fprintf(fplog, ":::\t%s\n", buffer);

		if(strncasecmp(buffer, ContentLength, STRLEN(ContentLength)) == 0){
			urlptr->declared_size = size = atol(buffer + STRLEN(ContentLength));

			if(debugflag >= 2){
				fprintf(stderr, "\t%s\n", buffer);
				fprintf(fplog,  "\t%s\n", buffer);
			}
		} else
		if(strncasecmp(buffer, ContentType, STRLEN(ContentType)) == 0){
			s = buffer + STRLEN(ContentType);

			urlptr->ctype = strdup(s);

			/* these two types are searched for references */
			     if(strncasecmp(s, TEXT_HTML, STRLEN(TEXT_HTML)) == 0)
				is_html = TRUE;
			else if(strncasecmp(s, X_DIRECTOR, STRLEN(X_DIRECTOR)) == 0)
				is_html = TRUE;


			if(debugflag){
				fprintf(stderr, "\t%s\n", buffer);
				fprintf(fplog,  "\t%s\n", buffer);
			}
		} else
		if(strncasecmp(buffer, Location, STRLEN(Location)) == 0){
			s = buffer + STRLEN(Location);

			urlptr->location = strdup(s);

			if(debugflag){
				fprintf(stderr, "\t%s\n", buffer);
				fprintf(fplog,  "\t%s\n", buffer);
			}

		} else

	/* HTTP/1.0 200 Document Follows
	   HTTP/1.0 301 Moved Permanently
	   HTTP/1.0 404 Not Found
	   HTTP/1.0 302 Found
	 */
		if(strncasecmp(buffer, HTTPreport, STRLEN(HTTPreport)) == 0
			&&
		   /* <ctype.h> functions need an unsigned char value */
		   isdigit((unsigned char) buffer[STRLEN(HTTPreport)])
		){
			if(debugflag)
				fprintf(stderr, "\t%s\n", buffer);

			retcode = atoi(buffer + STRLEN(HTTPreport));

			if(retcode == HTTP_FOUND || retcode == HTTP_MOVED){
				redirection = TRUE;
				dosave      = FALSE;
			}

			if(retcode != HTTP_OK){
				if(keepflag){
					/* keep the error document under another name */
					postfix = ".ERR";
					/* but dosave = TRUE; */
				} else {
					dosave = FALSE;
					fprintf(fplog, "\t@@@ Don't save it: %s\n", buffer);
				}
				urlptr->flags |= HTTPERROR;
			}
		}

		/* Empty line --> end of the header */
		if(!*buffer){
			state = BODY;
			break;
		}
	}
	urlptr->retcode = retcode;

	urlptr->hrefs = 0;
	add_counter = 0;        /* see addURL() */

	/* ................................................................. */
	if(redirection == TRUE){

		if(redirsaveflag){
			dosave = TRUE;
			urlptr->flags |= DONTPARSE;
		}

		if(urlptr->location == NULL){

			fprintf(fplog,  "\t@@@ Redirected %s has no redirection URL\n", urlptr->fullName);
			fprintf(stderr, "\t@@@ Redirected %s has no redirection URL\n", urlptr->fullName);

			/* Suppose a directory was asked for without the final '/' */
			if( *strlast(urlptr->fullName) != '/'){
				URL  *saveptr = list->currentURL;
				char *newname;

				list->currentURL = urlptr;
				newname = strspl(urlptr->fullName, "/", NULL);

				fprintf(fplog,  "\t@@@ Redirected to Index: %s\n", newname);
				fprintf(stderr, "\t@@@ Redirected to Index: %s\n", newname);

				addURL(list, newname, REDIRECTEDTO);
				free(newname);

				list->currentURL = saveptr;
			}

		} else {

			/* 302
			   Location: http://server/dir/doc
			   Location: /dir/doc                   (means http://thisserver/dir/doc)
			   Location: http:/dir/doc              (means http://thisserver/dir/doc)
			 */

			if(isFullURL3(urlptr->location)){
				fprintf(fplog,  "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, urlptr->location);
				fprintf(stderr, "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, urlptr->location);

				urlptr->flags |= REDIRECTED;

				addURL(list, urlptr->location, REDIRECTEDTO);

			} else {
				URL  *saveptr = list->currentURL;
				char *newname;

				/* Location: http:/newdir/newurl
				 * NB: this moves urlptr->location past the prefix,
				 * so it no longer points to the start of the
				 * strdup()ed string.
				 */
				if( strncmp(urlptr->location, HTTP_PREFIX, STRLEN(HTTP_PREFIX)) == 0)
					urlptr->location += STRLEN(HTTP_PREFIX);

				list->currentURL = urlptr;

				newname = strspl(HTTP_PREFIX, "//", urlptr->hostName, urlptr->location, NULL);

				fprintf(fplog,  "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, newname);
				fprintf(stderr, "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, newname);

				urlptr->flags |= REDIRECTED;

				addURL(list, newname, REDIRECTEDTO);
				free(newname);

				list->currentURL = saveptr;
			}
		}
	}
	/* ................................................................. */

	/* Save the rest into the cache file */
	/* This is the aim of this whole program !!! */
	if(dosave == TRUE){
		char renamed_name[sizeof(outname) + 10];
		Bool renamed;
		Bool parsed = FALSE;

		computeFileName(outname, hostname, urlname, urlptr->port, postfix);

		/* Create necessary directories */
		renamed = makepath(outname, redirection, renamed_name);

		if((fpout = fopen(outname, "w")) == NULL){
			myperror(outname);
			fclose(fpin);
			return (-1);
		} else {
			fprintf(fplog, "\t*** Saving to: %s\n", outname);
		}

		/* Here do:
		 *      save the URL file.
		 *      if it is HTML file - look for the HREF= in it.
		 */
		resetParser();
		urlptr->flags &= ~NOTPARSED;

		while((c = getc(fpin)) != EOF){

			putc(c, fpout);

			if(ferror(fpout)){
				fprintf(stderr, "### WRITE ERROR: %s\n", outname);
				fprintf(fplog,  "### WRITE ERROR: %s\n", outname);
				myperror("putc");
				error_counter++;

				break;
			}

			if(is_html && (urlptr->flags & DONTPARSE) == 0){
				checkChar(list, c & 0xFF);    /* see parse.c */
				parsed = TRUE;
			}
		}
		fclose(fpout);

		if(parsed == FALSE)
			urlptr->flags |= NOTPARSED;

		if(renamed == TRUE && compareFiles(outname, renamed_name) == TRUE){
			fprintf(fplog,  "\t@@@ New and Old are the same; unlinking Old\n");
			fprintf(stderr, "\t@@@ New and Old are the same; unlinking Old\n");
			unlink(renamed_name);
		}

		/* test for the correct size */
		if(stat(outname, &st) < 0){
			/* never look at an unfilled stat buffer:
			 * treat the file as empty and of no known type
			 */
			myperror(outname);
			st.st_size = 0;
			st.st_mode = 0;
		}
		if(size == NOSIZE){
			/* the server declared no length */
			fprintf(fplog, "\t*** Size: %lu (RANDOM)\n", (unsigned long) st.st_size);
			urlptr->flags |= RANDOMSIZE;

			if(st.st_size == 0)     /* probably there was a packet jam */
				urlptr->flags |= REGET;

		} else {
			/* the casts make the arguments match %lu whatever
			 * the widths of off_t and size_t are
			 */
			fprintf(fplog, "\t*** Size: %lu , declared: %lu (%s)\n",
				(unsigned long) st.st_size, (unsigned long) size,
				size == (size_t) st.st_size ? "OK" : "ERROR"
			);
			urlptr->flags &= ~RANDOMSIZE;

			if(size != (size_t) st.st_size){
				urlptr->flags |= (WRONGSIZE|RETRY);
				fprintf(stderr, "\t!!! Wrong size: %s (expected %lu, got %lu)\n",
								   urlptr->fullName,
								   (unsigned long) size,
								   (unsigned long) st.st_size);
				if(retryflag){
					fprintf(fplog, "\t@@@ Corrupted file deleted: %s\n",
						       outname);
					unlink(outname);
				}
			}
		}
		urlptr->size = st.st_size;

		if(picsonlyflag && isreg(st) && !isimage(urlptr)){
			unlink(outname);
			fprintf(fplog,  "\t@@@ Unlinking non Picture %s\n", outname);
			fprintf(stderr, "\t@@@ Unlinking non Picture %s\n", outname);
		}

	}
	fclose(fpin);

	/* addURL() counted the references found in this document */
	urlptr->hrefs = add_counter;
	if(add_counter > 0 && debugflag)
		fprintf(stderr, "\t%d references\n", add_counter);

	basehref = NULL;        /* Reset to "currentURL".
				   "base href=" is local to one document */

	return 0;
}

/* ____________________________________________________________________________ */

/*
	Write the status reports for the whole URL list.

	The compact one-line-per-URL table always goes to fpreport.
	The verbose dump is additionally written to the log file,
	but only when debugflag is greater than 3.
*/
void reportList(List *list){
	if(debugflag > 3){
		reportLongList(list, fplog);
	}

	reportShortList(list, fpreport);
}

/*
	Verbose dump of every URL record of the list to fp.

	For each record the following is printed:
	  - a header line: return code, level, serial number and,
	    when there is a parent, the parent's serial number;
	    the record equal to list->currentURL is prefixed with "* ";
	  - the full name (and the short name when built with DEBUG);
	  - host, url, location and content type, each only if set;
	  - the number of trys, only if positive;
	  - the counter;
	  - the names of all flags that are set (PROCESSED is not reported);
	  - an empty line.
*/
void reportLongList(List *list, FILE *fp){
	URL *url = list->head;

	fputs("---WWW SUMMARY REPORT---------------------------\n\n", fp);

	while(url != NULL){

		/* header line */
		if(url == list->currentURL)
			fputs("* ", fp);

		fprintf(fp, "%03d %03d #%lu", url->retcode, url->level, url->serial);
		if(url->parentURL != NULL)
			fprintf(fp, " --> #%lu", url->parentURL->serial);
		fputs("\n", fp);

		/* names */
		fprintf(fp, "%s\n", url->fullName);
#ifdef DEBUG
		fprintf(fp, "%s\n", url->shortName);
#endif

		/* optional attributes */
		if(url->hostName != NULL)
			fprintf(fp, "\thost:\t%s\n", url->hostName);
		if(url->urlName != NULL)
			fprintf(fp, "\turl:\t%s\n",  url->urlName);
		if(url->location != NULL)
			fprintf(fp, "\tloc:\t%s\n",  url->location);
		if(url->ctype != NULL)
			fprintf(fp, "\ttype:\t%s\n", url->ctype);
		if(url->trys > 0)
			fprintf(fp, "\ttrys:\t%d\n", url->trys);

		fprintf(fp, "\tcount:\t%d\n", url->counter);

		/* flag names, each followed by one space, in a fixed order */
		fputs("\tflags:\t", fp);
		if(url->flags & CONNECTED)    fputs("connected ", fp);
		if(url->flags & FAILED)       fputs("failed ",    fp);
		if(url->flags & SKIPPED)      fputs("skipped ",   fp);
		if(url->flags & IGNORED)      fputs("ignored ",   fp);
		if(url->flags & EXISTS)       fputs("exists ",    fp);
		if(url->flags & UNKNOWN)      fputs("unknown ",   fp);
		if(url->flags & WRONGSIZE)    fputs("badsize ",   fp);
		if(url->flags & HTTPERROR)    fputs("HTTPfail ",  fp);
		if(url->flags & RETRY)        fputs("retry ",     fp);
		if(url->flags & DONTPARSE)    fputs("dontparse ", fp);
		if(url->flags & NOTPARSED)    fputs("notparsed ", fp);
		if(url->flags & RANDOMSIZE)   fputs("random ",    fp);
		if(url->flags & BASEHREF)     fputs("base ",      fp);
		if(url->flags & REDIRECTEDTO) fputs("redir ",     fp);

		/* end of the flags line plus the empty separator line */
		fputs("\n\n", fp);

		url = url->next;
	}
}

/*
	Rewrite the compact report: fp is rewound and then one line
	per URL record is written, followed by a flush.

	Each line starts with six indicator characters:

	  1: 's' retcode is HTTP_OK (success), '-' retcode is 0 (not tried),
	     '>' retcode is HTTP_FOUND (redirected), 'f' anything else
	     (HTTP request failed)
	  2: first flag that is set among
	     RETRY 'R', IGNORED 'I', SKIPPED 'S', EXISTS 'E',
	     REDIRECTED '>', CONNECTED 'c', UNKNOWN 'u'; otherwise '-'
	  3: 'f' if FAILED, otherwise '-'
	  4: first that applies among
	     RANDOMSIZE '~', WRONGSIZE '%', HTTPERROR '+',
	     CONNECTED with size 0 '@'; otherwise '-'
	  5: 'D' if DONTPARSE, else 'N' if NOTPARSED, otherwise '-'
	  6: '#' if REDIRECTEDTO, else '$' if BASEHREF, otherwise '='
	     (printed immediately before the return code)

	then retcode, level, trys, size, declared_size, hrefs, serial,
	the parent's serial (0 when there is no parent), the full name
	and, when known, the content type in square brackets.
*/
void reportShortList(List *list, FILE *fp){
	URL *url;

	rewind(fp);

	for(url = list->head; url != NULL; url = url->next){
		int status, state, failed, sizemark, parsemark, origin;

		/* 1: outcome of the HTTP request */
		if(url->retcode == HTTP_OK)
			status = 's';           /* Success    */
		else if(url->retcode == 0)
			status = '-';           /* Not tried  */
		else if(url->retcode == HTTP_FOUND)
			status = '>';           /* Redirected */
		else
			status = 'f';           /* HTTP request Failed */

		/* 2: processing state, highest priority first */
		if(url->flags & RETRY)
			state = 'R';
		else if(url->flags & IGNORED)
			state = 'I';
		else if(url->flags & SKIPPED)
			state = 'S';
		else if(url->flags & EXISTS)
			state = 'E';
		else if(url->flags & REDIRECTED)
			state = '>';
		else if(url->flags & CONNECTED)
			state = 'c';
		else if(url->flags & UNKNOWN)
			state = 'u';
		else
			state = '-';

		/* 3: failure mark */
		if(url->flags & FAILED)
			failed = 'f';
		else
			failed = '-';

		/* 4: size / transfer anomalies */
		if(url->flags & RANDOMSIZE)
			sizemark = '~';
		else if(url->flags & WRONGSIZE)
			sizemark = '%';
		else if(url->flags & HTTPERROR)
			sizemark = '+';
		else if((url->flags & CONNECTED) && (url->size == 0))
			sizemark = '@';
		else
			sizemark = '-';

		/* 5: parsing state */
		if(url->flags & DONTPARSE)
			parsemark = 'D';
		else if(url->flags & NOTPARSED)
			parsemark = 'N';
		else
			parsemark = '-';

		/* 6: how the record came into the list */
		if(url->flags & REDIRECTEDTO)
			origin = '#';
		else if(url->flags & BASEHREF)
			origin = '$';
		else
			origin = '=';

		fprintf(fp,
			"%c %c%c%c%c %c%03d %03d %0d %7u %7u %03d %06lu %06lu %s",
			status, state, failed, sizemark, parsemark, origin,
			url->retcode,
			url->level,
			url->trys,
			url->size,
			url->declared_size,
			url->hrefs,
			url->serial,
			url->parentURL ? url->parentURL->serial : 0,
			url->fullName);

		/* optional content type, then end of line */
		if(url->ctype != NULL)
			fprintf(fp, " [%s]\n", url->ctype);
		else
			fputs("\n", fp);
	}

	fflush(fp);
}

/* ____________________________________________________________________________ */

/* Head of the singly linked list of skip rules; new rules are pushed in front. */
Item *skip_list;

/*
	Add a new rule to the front of skip_list.

	s    - URL path the rule applies to; a private copy is stored,
	       so the caller keeps ownership of s.
	       A path ending in '/' is recorded as a directory rule.
	flag - stored as the rule's flags (checkIfToSkip() tests
	       THISONLY, IGNORED and DONTPARSE in it).

	If memory cannot be allocated the rule is NOT added: an error is
	printed on stderr and skip_list is left unchanged, instead of
	dereferencing a NULL pointer here or storing a rule with a NULL
	text that would crash the string comparisons later.
*/
void addSkipItem(char *s, int flag){
	Item *newptr;

	newptr = (Item *) calloc(1, sizeof(Item));
	if(newptr == NULL){
		fprintf(stderr, "### OUT OF MEMORY: cannot add skip item %s\n", s);
		return;
	}

	newptr->text = strdup(s);
	if(newptr->text == NULL){
		fprintf(stderr, "### OUT OF MEMORY: cannot add skip item %s\n", s);
		free(newptr);
		return;
	}

	newptr->length   = strlen(s);
	newptr->flags    = flag;
	newptr->is_a_dir = (*strlast(s) == '/' ? TRUE : FALSE);
	/*      /...../dir/     or
		/...../doc.html
	 */

	/* push in front of the list */
	newptr->next = skip_list;
	skip_list = newptr;
}

/*
	Look up urlptr->urlName in skip_list and apply the first rule
	that matches.

	A rule matches when:
	  - it has THISONLY set: only if it is a directory rule and
	    the URL name is exactly the rule text;
	  - it is a directory rule:  the URL name starts with the rule text;
	  - it is a document rule:   the URL name equals the rule text, or is
	    the rule text followed by '#' (a fragment of that document).

	returns:

		TRUE            -       the matching rule has IGNORED:
					IGNORED is set in urlptr->flags;
					the caller should not process the URL

		FALSE           -       the matching rule has DONTPARSE:
					DONTPARSE is set in urlptr->flags;
					the caller continues
				-       no rule matched, or urlptr / its urlName
					is NULL: no flags changed;
					the caller continues

*/
Bool checkIfToSkip(URL *urlptr){
	Item *ptr;
	const char *name;

	/* urlName may be absent (other code in this file tests it for NULL);
	   such a record cannot match any rule */
	if(urlptr == NULL || urlptr->urlName == NULL)
		return FALSE;

	name = urlptr->urlName;

	for(ptr=skip_list; ptr; ptr=ptr->next){

		/* Exact match of a dir name */
		if(ptr->flags & THISONLY){

			if(ptr->is_a_dir == TRUE &&
			   strcmp(name, ptr->text) == 0)
				goto found;
			else
				continue;
		}

		/* ------------------------------------------- */
		/*
			text:           /AAA/
			urlName:        /AAA/.......
		*/
		if(ptr->is_a_dir == TRUE  &&
		   strncmp(name, ptr->text, ptr->length) == 0)
			goto found;

		/*
			text:           /AAA/BBB.html
			urlName:        /AAA/BBB.html
		*/
		if(ptr->is_a_dir == FALSE &&
		   strcmp(name, ptr->text) == 0)
			goto found;

		/*
			text:           /AAA/BBB.html
			urlName:        /AAA/BBB.html#......

			Indexing name[ptr->length] is safe here: the strncmp()
			above has just matched ptr->length characters.
		*/
		if(ptr->is_a_dir == FALSE &&
		   strncmp(name, ptr->text, ptr->length) == 0 &&
		   name[ptr->length] == '#')
			goto found;

	}
	return FALSE;

found:
	/* IGNORED takes precedence over DONTPARSE when a rule has both */
	if(ptr->flags & IGNORED){
		fprintf(fplog, "\t@@@ Skipping %s under %s\n\n", urlptr->urlName, ptr->text);
		urlptr->flags |= IGNORED;
		return TRUE;
	}
	if(ptr->flags & DONTPARSE){
		fprintf(fplog, "\t@@@ Don't parse HREFs in %s under %s\n\n", urlptr->urlName, ptr->text);
		urlptr->flags |= DONTPARSE;
		return FALSE;
	}
	return FALSE;
}
/* ____________________________________________________________________________ */

/*
	Tell whether ptr refers to a directory strictly ABOVE the directory
	of current, judging by the urlName paths only.

	The directory of current is its urlName up to and including
	the last '/':

	   current = "/aaa/bbb/ccc/"          directory "/aaa/bbb/ccc/"
	   current = "/aaa/bbb/ccc/ddd.html"  directory "/aaa/bbb/ccc/"

	For both of these TRUE is returned for

	   href    = "/aaa/bbb/"
	   href    = "/aaa/"
	   href    = "/"

	and FALSE for

	   href    = "/aaa/bbb/ccc/"        (the same directory)
	   href    = "/aaa/bbb/xxx....."    (not a prefix / not a directory)

	FALSE is also returned when either argument or either urlName is
	NULL, when ptr's urlName is empty, and when current's urlName
	contains no '/' at all (there is no directory to be a parent of).

	Neither urlName is modified.
*/
Bool isParent(URL *current, URL *ptr){
	char *currentName;
	char *ptrName;
	char *slash;
	size_t dirlen;          /* length of current's directory part  */
	size_t length;          /* length of ptr's urlName             */

	if(current == NULL || ptr == NULL)
		return FALSE;

	currentName = current->urlName;
	ptrName     = ptr->urlName;

	if(currentName == NULL || ptrName == NULL)
		return FALSE;

	/* Directory part of current: everything through the last '/'.
	   When currentName itself ends in '/', this is the whole string. */
	slash = strrchr(currentName, '/');
	if(slash == NULL)
		return FALSE;
	dirlen = (size_t)(slash - currentName) + 1;

	length = strlen(ptrName);

	/* ptrName must be strictly SHORTER than the directory part,
	   otherwise it can be at best the same directory */
	if(length == 0 || length >= dirlen)
		return FALSE;

	/* reference to the directory */
	if(*strlast(ptrName) != '/')
		return FALSE;

	/* ptrName is a beginning (prefix) of currentName */
	if(strncmp(ptrName, currentName, length) != 0)
		return FALSE;

	/* the character right after the prefix must not be another '/'
	   (length < dirlen, so this index is inside currentName) */
	if(currentName[length] == '/')
		return FALSE;

	return TRUE;
}
E 1
