/***************************************************************************\
 *  webgrab - v1.3 - Copyright 1995, Brian J. Swetland                     *
 *                                                                         *
 *  - initial version by Brian Swetland                                    *
 *  - cleaned up a bit by Brandon Long                                     *
 *  - proxy support by Kristin Buxton                                      *
 *  - cleaned up more by Brian Swetland                                    *
 *                                                                         *
 *  bcl version modified by Brandon Long                                   *
 *  - support the WWW_PROXY env var                                        *
 *  - support for non http:// urls through a proxy                         *
 *  - support for not having to use a protocol, assume http:               *
 *                                                                         *
 *  Free for any personal or non-commercial use.                           *
 *  Use at your own risk.  If you like it, buy the authors a pizza.        *
\***************************************************************************/

#define VERSION "1.3bcl"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>

#ifdef __bsdi__
# include <sys/malloc.h>
#else
# ifndef NeXT
#  include <malloc.h>
# endif
#endif

#include <sys/time.h>
#include <sys/types.h>

#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#include <netdb.h>

#include <string.h>

/*
 * strd - duplicate a NUL-terminated string on the heap.
 *
 * strdup isn't portable, so we make our own.
 *
 * Returns a freshly malloc'd copy of s, which the caller owns and may
 * free(), or NULL when s is NULL or the allocation fails.
 */
char *strd(char *s) {
    size_t size;
    char *d;

    if (s == NULL)
        return NULL;

    size = strlen(s) + 1;           /* +1 for the terminating NUL */
    d = malloc(size);
    if (d != NULL)
        memcpy(d, s, size);         /* copies the terminator as well */
    return d;
}

/*
 * copySpan - private helper for parseURL.
 * Returns a malloc'd, NUL-terminated copy of the len bytes starting at
 * start, or NULL if the allocation fails.
 */
static char *copySpan(const char *start, size_t len)
{
    char *copy = malloc(len + 1);

    if (copy != NULL) {
        memcpy(copy, start, len);
        copy[len] = '\0';
    }
    return copy;
}

/*
 * isURLEnd - private helper for parseURL.
 * True when c terminates the URL text: NUL, '"', '>' or whitespace.
 */
static int isURLEnd(char c)
{
        /* <ctype.h> functions need an unsigned char value */
    return c == '\0' || c == '"' || c == '>' || isspace((unsigned char) c);
}

/*
 * parseURL - split a URL of the form  [blah://]host[:port][/path]
 *
 * If the string contains no ':' at all, the scheme is assumed to be
 * "http" and the string is parsed as host[/path].  Otherwise the first
 * ':' must be followed by "//" and everything before it is the scheme.
 * The host ends at ':', '/', '"', '>', whitespace or end of string; the
 * path ends at '"', '>', whitespace or end of string and defaults to "/".
 * The port defaults to 80 and must be in 1..65535 when given.
 *
 * url       the text to parse; it is only read, never modified
 * protocal  out: scheme        (malloc'd, caller owns)
 * host      out: host name     (malloc'd, caller owns)
 * path      out: path          (malloc'd, caller owns)
 *
 * Returns the port number on success.  Returns 0 if the URL is
 * malformed or memory runs out; in that case nothing is left allocated
 * and all three outputs are set to NULL.
 */
int parseURL(char *url, char **protocal, char **host, char **path)
{
    const char *p, *mark;
    long port = 80;

    *protocal = NULL;
    *host = NULL;
    *path = NULL;

        /* skip anything up to the first : (the one after http, etc) */
    p = url;
    while (*p && *p != ':')
        p++;

    if (!*p) {
            /* no scheme given: assume http:// */
        p = url;
        *protocal = copySpan("http", 4);
    } else {
            /* REQUIRE two '/'s after the colon */
        if (p[1] != '/' || p[2] != '/')
            return 0;
        *protocal = copySpan(url, (size_t) (p - url));
        p += 3;                     /* step over "://" */
    }

        /* hostname is terminated by ':', '/', '>', '"', or whitespace */
    mark = p;
    while (*p != ':' && *p != '/' && !isURLEnd(*p))
        p++;
    *host = copySpan(mark, (size_t) (p - mark));

        /* optionally read a portnumber */
    if (*p == ':') {
        p++;
        port = 0;
        while (isdigit((unsigned char) *p)) {
            port = port * 10 + (*p - '0');
            if (port > 65535)       /* also keeps the sum from overflowing */
                goto fail;
            p++;
        }
            /* the port must be non-zero and followed by a path or the end */
        if (port == 0 || (*p != '/' && !isURLEnd(*p)))
            goto fail;
    }

        /* whatever is left is the path */
    if (*p == '/') {
        mark = p;
        while (!isURLEnd(*p))
            p++;
        *path = copySpan(mark, (size_t) (p - mark));
    } else {
        *path = copySpan("/", 1);
    }

    if (*protocal == NULL || *host == NULL || *path == NULL)
        goto fail;                  /* out of memory */

    return (int) port;

fail:
    free(*protocal);
    free(*host);
    free(*path);
    *protocal = NULL;
    *host = NULL;
    *path = NULL;
    return 0;
}

/*
 * usage - print the command-line synopsis on stdout and terminate the
 * program with exit status 1.  Does not return.
 *
 * argv  the program name to show in the synopsis (callers pass argv[0])
 *
 * The option list mirrors the switch in main(): -s, -h, -r, -v and -p.
 */
void usage(char *argv) 
{
    printf("\nWebgrab: The Command Line Browser\tVersion %s \n",VERSION);
    printf("Usage: %s [-shrvp] [<proxy>] <url>\n",argv);
    printf("   -s      Suppress Headers\n");
    printf("   -h      Headers Only\n");
    printf("   -r      Read HTTP headers from stdin\n");
    printf("   -v      Verbose (show the request on stderr)\n");
    printf("   -p      Next argument is <proxy>\n");
    printf("   <proxy> HTTP Proxy Host ( hostname[:port] format )\n");
    printf("   <url>   URL to retrieve (in http:// format)\n\n");
    exit(1);
}

/*
 * main - fetch one URL and copy the server's response to stdout.
 *
 * Command line:  [-shrvp] [<proxy>] <url>
 *   -s  do not print the response headers
 *   -h  send a HEAD request instead of GET
 *   -r  copy extra request headers from stdin instead of sending the
 *       built-in User-Agent header
 *   -v  describe the request on stderr before sending it
 *   -p  the next argument is the proxy, as hostname[:port]
 *
 * The WWW_PROXY environment variable supplies a default proxy; a proxy
 * given with -p overrides it.  Without a proxy only http URLs are
 * accepted.  With a proxy, a URL that parseURL() rejects is passed to
 * the proxy verbatim.
 *
 * Exits with status 0 after the response has been copied and 1 on any
 * error (usage() also exits with 1).
 */
int main(int argc, char *argv[])
{
    int s, i, port, pport = 80;
    long val;
    size_t n;
    struct sockaddr_in sa;
    struct hostent *hp;
    FILE *fpo,*fpi;
    char buf[1024];
    char *path = NULL, *host = NULL, *p, *end;
    char *protocal = NULL;
    char *proxy = NULL;
    char *url = NULL;
    const char *method;

    /* operational flags */
    int ignore=0,head=0,readin=0,proxynext=0,proxyarg=0,urlmunge=0,verbose=0;

    proxy = getenv("WWW_PROXY");
    if(proxy && !*proxy) proxy = NULL;      /* empty means "not set" */

    for(i = 1; i < argc; i++){
        if(proxynext){
                /* this arg is our proxy (overrides WWW_PROXY) */
            proxy = argv[i];
            proxynext = 0;
            continue;
        }
        if(argv[i][0]=='-'){
                /* walk the option letters of THIS argument */
            for(p=&argv[i][1];*p;p++){
                switch(*p){
                case 'r':
                    readin = 1;
                    break;
                case 's':
                    ignore = 1;
                    break;
                case 'h':
                    head = 1;
                    break;
                case 'v':
                    verbose = 1;
                    break;
                case 'p':
                        /* only one -p is allowed */
                    if(proxyarg) usage(argv[0]);
                    proxyarg = 1;
                    proxynext = 1;
                    break;
                default:
                    usage(argv[0]);
                }
            }
            continue;
        }
            /* must be a url */
        if(url) usage(argv[0]);
        url = argv[i];
    }

    if(proxynext || !url) usage(argv[0]);

    method = head ? "HEAD" : "GET";

  /* find the server */
    if(proxy){
            /* Split host and port in a private copy: the string may be
               owned by the environment and must not be written to. */
        n = strlen(proxy) + 1;
        p = malloc(n);
        if(!p){
            fprintf(stderr,"error: out of memory\n");
            exit(1);
        }
        memcpy(p, proxy, n);
        proxy = p;

        /* look for a portnum */
        p = strchr(proxy, ':');
        if(p){
            *p++ = 0;
            val = strtol(p, &end, 10);
            if(end == p || *end || val < 1 || val > 65535){
                fprintf(stderr,"error: invalid proxy port %s.\n",p);
                exit(1);
            }
            pport = (int) val;
        }
        if(!(hp = gethostbyname(proxy))) {
            fprintf(stderr,"error: can't get proxy %s.\n",proxy);
            exit(1);
        }
        if (!(port=parseURL(url, &protocal, &host, &path))) {
          fprintf(stderr,"%s: url parse failed, trying proxy anyways\n",argv[0]);
          urlmunge = 1;     /* send the URL to the proxy untouched */
        }
    } else {
        if (!(port=parseURL(url, &protocal, &host, &path))) {
          fprintf(stderr,"error: invalid url\n");
          exit(1);
        }
        if (strcmp(protocal,"http")) {
          fprintf(stderr,"%s: error: webgrab only supports http without a proxy\n",argv[0]);
          exit(1);
        }

        if(!(hp = gethostbyname(host))) {
            fprintf(stderr,"error: can't get host %s.\n",host);
            exit(1);
        }
    }

    if (verbose) {
      fprintf(stderr, "Webgrab: The Command Line Browser\tVersion %s \n",VERSION);
      if (proxy)
        fprintf(stderr,"Proxy: %s:%d\n",proxy,pport);
      fprintf(stderr, "Request: %s ",method);
      if (urlmunge)
        fprintf(stderr,"%s ",url);
      else
        fprintf(stderr,"%s://%s:%d%s ",protocal,host,port,path);
      fprintf(stderr,"HTTP/1.0\n");
      fprintf(stderr,"\n");
    }

    /* sa is a sockaddr_in, so only an address that fits it is usable */
    if(hp->h_addrtype != AF_INET || hp->h_length < 0 ||
       (size_t) hp->h_length > sizeof(sa.sin_addr)){
        fprintf(stderr,"error: unsupported address type\n");
        exit(1);
    }

    /* Setup the socket */
    memset(&sa, 0, sizeof(sa));
    sa.sin_port = htons(proxy ? pport : port);
    memcpy(&sa.sin_addr, hp->h_addr, (size_t) hp->h_length);
    sa.sin_family = hp->h_addrtype;

    /* allocate the socket */
    if((s = socket(hp->h_addrtype, SOCK_STREAM, 0)) < 0){
        fprintf(stderr,"error: can't get socket\n");
        exit(1);
    }

        /* connect to the server */
    if(connect(s, (struct sockaddr *) &sa, sizeof(sa)) < 0){
        close(s);
        fprintf(stderr,"error: can't connect\n");
        exit(1);
    }

        /* one stdio stream per direction on the same descriptor */
    fpo = fdopen(s,"w");
    fpi = fdopen(s,"r");
    if(!fpo || !fpi){
        close(s);
        fprintf(stderr,"error: can't open socket streams\n");
        exit(1);
    }

        /* request line: a proxy gets the absolute URL, a server the path */
    if(proxy){
        if (urlmunge)
            fprintf(fpo,"%s %s HTTP/1.0\r\n",method,url);
        else
            fprintf(fpo,"%s %s://%s:%d%s HTTP/1.0\r\n",method,
                    protocal,host,port,path);
    } else {
        fprintf(fpo,"%s %s HTTP/1.0\r\n",method,path);
    }

    if (readin) {
            /* copy headers from stdin until EOF or a read error */
        while((n = fread(buf,1,sizeof(buf),stdin)) > 0)
            fwrite(buf,1,n,fpo);
    } else {
            /* send our normal header info */
        fprintf(fpo,
                "User-Agent: WebGrab/%s (commandline forever)\r\n",
                VERSION);
    }
    fputs("\r\n",fpo);      /* blank line ends the request */
    fflush(fpo);

        /* handle headers: they run up to and including the first empty
           line; a folded continuation line starts with a space or tab
           and does not end them */
    while(fgets(buf,sizeof(buf),fpi)){
        if(!ignore) fputs(buf,stdout);
        if(buf[0]=='\r' || buf[0]=='\n') break;
    }
        /* everything after the headers is the body */
    while((n = fread(buf,1,sizeof(buf),fpi)) > 0)
        fwrite(buf,1,n,stdout);

    close(s);
    exit(0);
}
