h40702
s 00021/00003/00266
d D 1.23 04/09/17 03:27:07 abogaty 23 22
c Human interface ^C
e
s 00001/00001/00268
d D 1.22 04/06/20 15:07:32 abogaty 22 21
c 
e
s 00002/00001/00267
d D 1.21 04/06/20 03:18:10 abogaty 21 20
c hops
e
s 00000/00043/00268
d D 1.20 04/06/20 02:34:52 abogaty 20 19
c DNS cache (gethostbyname)
e
s 00002/00002/00309
d D 1.19 03/12/08 01:02:25 abogaty 19 18
c 
e
s 00003/00000/00308
d D 1.18 03/08/10 16:29:43 abogaty 18 17
c 
e
s 00001/00001/00307
d D 1.17 03/07/27 01:16:34 abogaty 17 16
c 
e
s 00004/00000/00304
d D 1.16 03/07/27 00:53:17 abogaty 16 15
c SIGHUP killed the netread
e
s 00000/00000/00304
d D 1.15 03/05/02 20:19:59 abogaty 15 14
c 
e
s 00000/00000/00304
d D 1.14 03/05/02 02:41:29 abogaty 14 13
c 
e
s 00000/00000/00304
d D 1.13 03/04/22 17:54:33 abogaty 13 12
c 
e
s 00000/00000/00304
d D 1.12 03/04/12 01:12:05 abogaty 12 11
c reget_appendflag
e
s 00022/00002/00282
d D 1.11 03/02/04 21:41:50 abogaty 11 10
c 
e
s 00007/00004/00277
d D 1.10 02/09/18 02:00:43 abogaty 10 9
c -verifysizes
e
s 00000/00000/00281
d D 1.9 02/09/12 19:57:10 abogaty 9 8
c -texttoo flag to parse *.txt files as well
e
s 00003/00000/00278
d D 1.8 02/09/11 07:55:02 abogaty 8 7
c -relaxtime
e
s 00005/00005/00273
d D 1.7 02/09/11 04:47:08 abogaty 7 6
c use_gzip flag
e
s 00000/00000/00278
d D 1.6 02/09/03 21:28:38 abogaty 6 5
c -trim flag
e
s 00000/00000/00278
d D 1.5 02/06/20 03:42:19 abogaty 5 4
c nobreak_amp
e
s 00000/00000/00278
d D 1.4 02/06/08 03:30:38 abogaty 4 3
c Fix NWORDS => NJAVAWORDS
e
s 00000/00000/00278
d D 1.3 02/06/08 02:42:56 abogaty 3 2
c javascript:
e
s 00001/00001/00277
d D 1.2 02/05/25 04:30:46 abogaty 2 1
c no
e
s 00278/00000/00000
d D 1.1 02/05/21 20:01:34 abogaty 1 0
c date and time created 02/05/21 20:01:34 by abogaty
e
u
U
f e 0
t
T
I 1
#include "defs.h"

#include <signal.h>
#include <errno.h>

extern int errno;
I 11
extern char ContentLength[];
E 11

/* Protocol/version token that callHost() places at the end of every request line. */
char HTTPPROTO[] = "HTTP/1.0";

/*
 * initNet - choose the TCP port used for HTTP connections.
 *
 * HTTPPORT is stored in network byte order.  The "http"/"tcp" entry of
 * the services database is used when it exists, port 80 otherwise; a
 * non-zero basePort then overrides either choice.
 */
void initNet(){
	struct servent *service = getservbyname("http", "tcp");

	/* s_port comes back from the lookup already in network order */
	HTTPPORT = (service != NULL) ? service->s_port : htons(80);

	if(basePort != 0)
		HTTPPORT = htons(basePort);
}

/* Number of seconds handed to alarm() around each socket read in callHost(). */
int TIMEOUT = 60 * 15;     /* 15 minutes */
/*
 * Set to TRUE by onAlarm() when SIGALRM fires; callHost() resets it to
 * FALSE before every read and tests it afterwards.
 * NOTE(review): the flag is written from a signal handler but is not
 * declared volatile sig_atomic_t here - confirm how Bool is defined.
 */
Bool timed_out = FALSE;

/*
 * onAlarm - SIGALRM handler installed by callHost() around each socket read.
 *
 * Raises the timed_out flag and reports the timeout, with the TIMEOUT
 * value, on stderr and in fplog.  The nsig argument is not used.
 *
 * NOTE(review): fprintf() is not on the POSIX list of async-signal-safe
 * functions, so calling it from a handler is formally undefined if the
 * signal interrupts another stdio call - consider only setting the flag
 * here and logging from callHost().
 */
static void onAlarm(int nsig){
	timed_out = TRUE;

D 23
	fprintf(stderr, "ERR timed out after %d seconds\n", TIMEOUT);
	fprintf(fplog,  "ERR timed out after %d seconds\n", TIMEOUT);
E 23
I 23
	fprintf(stderr, "@ERROR timed out after %d seconds\n", TIMEOUT);
	fprintf(fplog,  "@ERROR timed out after %d seconds\n", TIMEOUT);
E 23
}

/*
 * callHost - fetch one URL over HTTP into the temporary file tmpname and
 * pass that file to processDocument().
 *
 * list    - URL list; handed to countRemaining() and processDocument().
 * urlptr  - URL to fetch; its trys counter is incremented on every call.
 *
 * The connection goes to urlptr's host and port, or to
 * proxy_host:proxy_port when proxyflag is set.  The request line and
 * headers are written one at a time, then everything read from the socket
 * is copied to tmpname.  Each read is guarded by alarm(TIMEOUT).
 *
 * Returns:
 *   UNKNOWN          the host name could not be resolved
 *   FAILED           socket() or creat() failed
 *   FAILED|RETRY     connect() failed
 *   CONNECTED        otherwise, possibly OR-ed with FAILED|RETRY (read
 *                    error, timeout, tmp-file write error, skip_this == 2)
 *                    or FAILED|IGNORED (any other non-zero skip_this)
 *
 * processDocument() is called whenever skip_this is zero after the read
 * loop, including after a read error.  tmpname is unlinked and skip_this
 * is reset to 0 before the final return.
 */
int callHost(List *list, URL *urlptr){
	struct sockaddr_in outAddr;
	unsigned long addr;
	struct hostent *hp;
	int sock, tmpfd;
	int nread;
	int on = 1;
	size_t sent = 0;	/* bytes copied from the socket to the tmp file */
	int code = 0;
	char message[4096];	/* header scratch buffer, reused as the read buffer */
I 11
	char *method = NULL;
E 11
	void (*savesig)();
I 23
	void (*swr)();
E 23

	char *hostname = urlptr->hostName;
	char *urlname  = urlptr->urlName;
	u_short port   = urlptr->port;

	if(proxyflag){
		hostname = proxy_host;
		port     = proxy_port;
		if(debugflag > 1) fprintf(stderr, "Proxy %s:%d\n", hostname, ntohs(port));
	}

	urlptr->trys++;         /* number of a trial */

	memset((char *) &outAddr, 0, sizeof(outAddr));

	outAddr.sin_family = AF_INET;
	outAddr.sin_port   = port;     /* net order */

	/*
	 * A dotted-quad hostname is used directly; anything else is resolved
	 * by name and the first returned address is used.
	 * NOTE(review): addr is unsigned long, so "== -1" compares against
	 * ULONG_MAX.  Where unsigned long is wider than in_addr_t the failure
	 * value of inet_addr() never matches and the resolver branch is
	 * skipped - confirm on LP64 targets (compare with INADDR_NONE).
	 */
	if((addr = inet_addr(hostname)) == -1){
D 19
		if((hp = gethostbyname(hostname)) == NULL){
E 19
I 19
		if((hp = MY_gethostbyname(hostname)) == NULL){
E 19
			fprintf(fplog, "Unknown host: %s\n", hostname);
			return UNKNOWN;

		} else saveAddrList(urlptr, hp);

		memcpy((char *) &outAddr.sin_addr, (hp->h_addr_list)[0], hp->h_length);
	} else {
		outAddr.sin_addr.s_addr = addr;
	}
	if(debugflag)
		fprintf(fplog, "\tUSEADDR: %s\n", inet_ntoa(outAddr.sin_addr));

	if((sock = socket(outAddr.sin_family, SOCK_STREAM, 0)) < 0){
		myperror("socket");
		return FAILED;
	}
	if(keepalive){
		on = 1; setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &on, sizeof on);
	}
	/* NOTE(review): &outAddr is passed without a cast to struct sockaddr *. */
	if(connect(sock, &outAddr, sizeof(outAddr)) < 0){
		myperror("connect");
		close(sock);
		return (FAILED|RETRY);
	}

	if(debugflag > 1) fprintf(stderr, "Connected to %s\n", hostname);

	/* The response is spooled into tmpname; it is unlinked before returning. */
	if((tmpfd = creat(tmpname, 0644)) < 0){
		myperror("creat");
		close(sock);
		return FAILED;
	}
	/* ----------------------------------------------------------- */
	code = CONNECTED;

	/* ----------- HTTP protocol --------------------------------- */
	/*
	 * Each header is formatted into message and written separately.
	 * NOTE(review): every sprintf() below is unbounded against the
	 * 4096-byte message buffer (urlName, fullName, authstring and
	 * post_data lengths are not checked here), and the results of the
	 * socket write() calls are ignored.
	 */
I 11
	if(urlptr->flags & USEPOST)
		method = "POST";
	else if (headOnly)
		method = "HEAD";
	else
		method = "GET";

E 11
	if(proxyflag){
D 10
		sprintf(message, "GET http://%s:%d%s %s\r\n", 
E 10
I 10
		sprintf(message, "%s http://%s:%d%s %s\r\n", 
D 11
			headOnly ? "HEAD" : "GET",
E 11
I 11
			method,
E 11
E 10
			urlptr->hostName,
			ntohs(urlptr->port),
			urlptr->urlName,
			HTTPPROTO);
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);

		sprintf(message, "Host: %s:%d\r\n",
			urlptr->hostName,
			ntohs(urlptr->port)
		);
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);

	} else {
D 10
		sprintf(message, "GET %s %s\r\n", urlname, HTTPPROTO);
E 10
I 10
		sprintf(message, "%s %s %s\r\n",
D 11
			headOnly ? "HEAD" : "GET",
E 11
I 11
			method,
E 11
			urlname, HTTPPROTO);
E 10
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);

		if(hostflag){
			sprintf(message, "Host: %s\r\n", hostname);
			write(sock, message, strlen(message));
			if(debugflag > 2) fprintf(stderr, "\t%s", message);
		}
	}
	if(urlptr->parentURL && urlptr->parentURL->fullName){
		sprintf(message, "Referer: %s\r\n", urlptr->parentURL->fullName);
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);
	}

	sprintf(message, "User-Agent: Mozilla/4.75 [en] (X11; U; Linux 2.2.16.22 i686)\r\n");
	write(sock, message, strlen(message));
	if(debugflag > 2) fprintf(stderr, "\t%s", message);

#if 0
	sprintf(message, "Proxy-Connection: Keep-Alive\r\n");
	write(sock, message, strlen(message));
	if(debugflag > 2) fprintf(stderr, "\t%s", message);
#endif

	sprintf(message, "Pragma: no-cache\r\n");
	write(sock, message, strlen(message));
	if(debugflag > 2) fprintf(stderr, "\t%s", message);

	sprintf(message, "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*\r\n");
	write(sock, message, strlen(message));
	if(debugflag > 2) fprintf(stderr, "\t%s", message);

D 7
/*
	sprintf(message, "Accept-Encoding: gzip\r\n");
	write(sock, message, strlen(message));
	if(debugflag > 2) fprintf(stderr, "\t%s", message);
*/
E 7
I 7
	if(use_gzip){
		sprintf(message, "Accept-Encoding: gzip\r\n");
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);
	}
E 7

	if(authstring && *authstring){
		sprintf(message, "%s\r\n", authstring);
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);
	}
	/*
	 * For POST requests a length header is sent here and the body follows
	 * the blank line below.
	 * NOTE(review): strlen() yields size_t but is printed with %d, and the
	 * body written further down is post_data plus "\r\n", i.e. two bytes
	 * more than the length announced here - confirm this is intended.
	 */
I 11
	if(urlptr->flags & USEPOST){
		sprintf(message, "%s %d\r\n", ContentLength, strlen(urlptr->post_data));
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);
	}
E 11

	/* An empty line ends the header section. */
	sprintf(message, "\r\n");
	write(sock, message, strlen(message));
	if(debugflag > 2) fprintf(stderr, "\t%s", message);

I 11
	if(urlptr->flags & USEPOST){
		sprintf(message, "%s\r\n", urlptr->post_data);
		write(sock, message, strlen(message));
		if(debugflag > 2) fprintf(stderr, "\t%s", message);
	}

E 11
	if(debugflag > 1) fprintf(stderr, "Sent request for %s\n", urlname);

	/* Progress banner: serial, level, hops/trys, date, target and queue state. */
	if(debugflag){
		int toretry;
		int rest = countRemaining(list, &toretry);

D 21
		fprintf(stderr, "[%d.%d:%d] %s %s %s:%u%s\n[%d URLs left, %d to retry]\n",
E 21
I 21
		fprintf(stderr, "[%d.%d:%d/%d] %s %s %s:%u%s\n[%d URLs left, %d to retry]\n",
E 21
			urlptr->serial,
			urlptr->level,
D 22
			urlptr->trys,
E 22
I 21
			urlptr->hops,
I 22
			urlptr->trys,
E 22
E 21
			currentDate(),

			urlptr->trys == 1 ? "Contacting" : "Retrying",

			hostname,
			ntohs(port),
			urlname,
			rest,
			toretry
		);
		if(proxyflag)
D 2
		fprintf(stderr, "GET http://%s:%d%s\n",
E 2
I 2
		fprintf(stderr, "[http://%s:%d%s]\n",
E 2
			urlptr->hostName,
			ntohs(urlptr->port),
			urlptr->urlName);
	}

	/* Copy the response from the socket to the tmp file until EOF or an error. */
	for(;;){

		errno = 0;
		timed_out = FALSE;
		/*
		 * NOTE(review): savesig is overwritten on every iteration.  From the
		 * second pass on, sigset() presumably returns onAlarm (installed by
		 * the previous pass), so the restore after the loop would reinstall
		 * onAlarm rather than the caller's original SIGALRM disposition.
		 */
		savesig = sigset(SIGALRM, onAlarm);
		alarm(TIMEOUT);

		nread = read(sock, message, sizeof(message));

		alarm(0);

		/*
		 * skip_this is set outside this block (presumably by the ^C handling
		 * of delta 1.23 - confirm); 2 requests a retry, any other non-zero
		 * value marks the URL as ignored.
		 */
I 23
		if(skip_this){
			if(skip_this == 2)
				code |= (FAILED|RETRY);
			else	code |= (FAILED|IGNORED);
			myperror("file download abandoned");
			errno = 0;
			break;
		}
E 23
		/* The alarm fired during the read: report it as ETIMEDOUT. */
		if(nread <= 0 && timed_out == TRUE){
			errno = ETIMEDOUT;
			nread = (-1);
			break;
		}
I 16
D 17
		if(nread <= 0 && errno == EINTR){
E 17
I 17
		if(nread <= 0 && errno == EINTR && restart_intr){
E 17
			myperror("net read interrupted");
			continue;
		}
E 16
		/* EOF (0) or a read error (< 0) ends the transfer. */
		if(nread <= 0)
			break;

		/* SIGINT is ignored for the duration of the tmp-file write. */
I 23
		swr = sigset(SIGINT, SIG_IGN);
E 23
		if(write(tmpfd, message, nread) != nread){
I 23
			sigset(SIGINT, swr);
E 23
D 10
			fprintf(stderr, "@@@ FATAL ERROR writing /tmp file: %s\n", strerror(errno));
			fprintf(fplog,  "@@@ FATAL ERROR writing /tmp file: %s\n", strerror(errno));
E 10
I 10
			fprintf(stderr, "@@@ FATAL ERROR writing tmp file: %s\n", strerror(errno));
			fprintf(fplog,  "@@@ FATAL ERROR writing tmp file: %s\n", strerror(errno));
I 18
			myperror("tmp write");
			code |= (FAILED|RETRY);
			break;
E 18
E 10
		}
I 23
		sigset(SIGINT, swr);
E 23
		/* NOTE(review): sent is size_t but is printed with %lu below. */
		sent += nread;
		if(debugflag)
		       fprintf(stderr, "Sent: %lu\r", sent);
	}
	sigset(SIGALRM, savesig);

	/* A timeout arrives here as nread == -1 with errno == ETIMEDOUT. */
	if(nread < 0 && errno != 0){
		myperror("net read");
		code |= (FAILED|RETRY);
	}
	if(debugflag)
	       fprintf(stderr, "Sent: %lu\n", sent);

	close(tmpfd);
	close(sock);

I 8
	if(relaxtime)
		sleep(relaxtime);

E 8
	/* An abandoned download is reported and not parsed; anything else is processed. */
D 23
	processDocument(list, urlptr, tmpname);
E 23
I 23
	if(skip_this){
		fprintf(stderr, "@@@ DOCUMENT SKIPPED\n");
		fprintf(fplog,  "@@@ DOCUMENT SKIPPED\n");
	}
	else
		processDocument(list, urlptr, tmpname);
E 23
	unlink(tmpname);
I 23
	skip_this = 0;
E 23

	return code;
}
D 20

/*
 * saveAddresses - make a private copy of a host entry's address list.
 *
 * hp - resolver result whose h_addr_list is copied.
 *
 * Returns a calloc()ed array of struct in_addr holding the addresses of
 * hp followed by one all-zero terminator entry, or NULL when the
 * allocation fails.  The array is not freed here.
 */
struct in_addr *saveAddresses(struct hostent *hp){
	int count, i;
	char **p;
	struct in_addr *vectptr;

	count = lengthVector(hp->h_addr_list);

	/* one extra zero-filled slot acts as the end marker */
	vectptr = calloc((size_t) count + 1, sizeof *vectptr);
	if(vectptr == NULL)
		return NULL;

	/* i < count keeps the end marker intact even if the vector holds
	 * more entries than lengthVector() reported */
	for (p = hp->h_addr_list, i = 0; *p != 0 && i < count; p++, i++){
		(void) memcpy(&vectptr[i].s_addr, *p, sizeof(struct in_addr));
#ifdef DEBUG
		fprintf(fplog, "\tADDR: %s\n", inet_ntoa(vectptr[i]));
#endif
	}
	return vectptr;
}

/*
 * saveAddrList - remember the addresses of a resolved host in its URL.
 *
 * Leaves ptr untouched when it already carries an address_list;
 * otherwise stores the copy produced by saveAddresses(hp).
 */
void saveAddrList(URL *ptr, struct hostent *hp){
	if(ptr->address_list != NULL)
		return;

	ptr->address_list = saveAddresses(hp);
}

/*
 * oneOfAddresses - test whether a host's first resolved address appears in
 * a saved address vector.
 *
 * hostname  - name to resolve.
 * addresses - vector scanned until a zero entry; presumably one built by
 *             saveAddresses() - confirm against callers.
 *
 * Returns TRUE on a match, FALSE when the name does not resolve or no
 * entry matches.
 */
Bool oneOfAddresses(char *hostname, struct in_addr *addresses){
	struct hostent *hp;
	struct in_addr hostaddr, *addrptr;
	char **p;

D 19
	if((hp = gethostbyname(hostname)) == NULL)
E 19
I 19
	if((hp = MY_gethostbyname(hostname)) == NULL)
E 19
		return FALSE;

	/* Only the first address of the host entry is compared. */
	memcpy((char *) &hostaddr.s_addr, (hp->h_addr_list)[0], sizeof hostaddr.s_addr);

	/*
	 * NOTE(review): the vector is walked through a char **, so each step
	 * advances by sizeof(char *) and the end test reads a pointer-sized
	 * word.  That matches an array of struct in_addr only where the two
	 * sizes are equal; elsewhere entries would be skipped - confirm.
	 */
	for (p = (char **) addresses; *p != 0; p++){
		addrptr = (struct in_addr *) p;

		if(addrptr->s_addr == hostaddr.s_addr)
			return TRUE;

	}
	return FALSE;
}
E 20
E 1
