Article 1597 of comp.infosystems.www:
Xref: feenix.metronet.com comp.infosystems.www:1597
Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!math.ohio-state.edu!cs.utexas.edu!geraldo.cc.utexas.edu!ansel.cc.utexas.edu!not-for-mail
From: zippy@ansel.cc.utexas.edu (Jack Lund)
Newsgroups: comp.infosystems.www
Subject: Re: How to get gopher files with perl?
Date: 14 Sep 1993 13:42:50 -0500
Organization: The University of Texas - Austin
Lines: 711
Message-ID: <2753ba$a2m@ansel.cc.utexas.edu>
References: <1993Sep14.163827.7182@news.unige.ch>
NNTP-Posting-Host: ansel.cc.utexas.edu

In article <1993Sep14.163827.7182@news.unige.ch>,
Oscar Nierstrasz <oscar@cui.unige.ch> wrote:
>
>I want to get a large number of gopher files using a shell/perl script
>(actually I don't want to -- Simon Gibbs next door wants to).
>
>The gopher protocol looked simple, so I thought it would be easy.
>It seems to work, but the files returned are short if they are image files (GIF).
>What am I doing wrong?  Are there any good pointers to on-line doc?
>(I have seen the gopher protocol.rtf at cern, but it seems to be
>somewhat out of date ...)
>
>I thought all you had to do was connect, send the name of the file you
>want, and gobble up the reply.  That doesn't seem to work.
>Are image files compressed or otherwise encoded?
>
>Here is the perl subroutine I am using:

I modified a perl script of yours (remember hget?) to take (just
about) any URL and grab the appropriate document. It works well for
binary & ascii files. You might check it out.

I haven't had a chance to take a close look at your code, but if I
discover anything, I'll let you know.

-------------------------url_get------------------------------------
#!/bin/perl
#
# url_get      --- get a document given a WWW URL
#
# Jack Lund 9/3/93 zippy@ccwf.cc.utexas.edu
#
# from hget by:
# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
#

$home = $ENV{"HOME"};

require "chat2.pl";
require "$home/lib/perl/URL.pl";
require "$home/lib/perl/ftplib.pl";
require "getopts.pl";
&Getopts(':b');

die "Usage: url_get <http-url> ...\n" unless $#ARGV >= 0;
$timeout = undef;

foreach $url (@ARGV) {
        ($protocol, $host, $port, $rest1, $rest2, $rest3) = &url'parse_url($url);
        foo:
    {
        if ($protocol eq "http") {
	    &http_get($host,$port,$rest1); last foo;
        }
        if ($protocol eq "gopher") {
	    &gopher_get($host, $port, $rest1, $rest2, $rest3); last foo;
        }
	if ($protocol eq "file") {
	    &file_get($host, $port, $rest1); last foo;
	}
	if ($protocol eq "news") {
	    &news_get($host, $port, $rest1); last foo;
	}
        die "Protocol $protocol not supported!\n";
    }
}

sub http_get {
    local($host,$port,$request) = @_;
    ($handle = &chat'open_port($host, $port))
        || die "chat'open($host,$port): $!\n";
    &chat'print($handle,"GET $request\n")
        || die "chat'print(GET $request): $!\n";
    *S = *chat'S;
	while (<S>) {
		print "$_";
	}
    &chat'close($handle);
}

sub gopher_get {
    local($host,$port,$gtype,$selector,$search) = @_;
    $request = ($search ? "$selector\t$search" : $selector);
    ($handle = &chat'open_port($host, $port))
        || die "chat'open($host,$port): $!\n";
    &chat'print($handle,"$request\n")
        || die "chat'print($request): $!\n";
	*S = *chat'S;

# If this is a binary document, retreive it using sysreads rather
# than <S>

    if ($gtype eq '5' || $gtype eq '9' || $gtype eq 'I') {
	    $done = 0;
	    $rmask = "";
		vec($rmask,fileno(S),1) = 1;
		do {
			($nfound, $rmask) =
		 		select($rmask, undef, undef, $timeout);
			if ($nfound) {
				$nread = sysread(S, $thisbuf, 1024);
				if ($nread > 0) {
					syswrite(STDOUT, $thisbuf, $nread)
                        || die "Syswrite: $!\n";
				} else {
					$done++;
				}
			} else {
				warn "Timeout\n"; $done++;
			}
		} until $done;
	}

# This is an ASCII document, and we can get it line-by-line using <S>

	else {
		while (<S>) {
			last if (/^\.\r\n$/);
			chop; chop;
			print "$_\n";
		}
	}
	&chat'close($handle);
}

sub file_get {
    local($host, $port, $path) = @_;

    $localhost = `hostname`;
    if ($host eq $localhost && !defined($port)) {
	open(IN, $path) || die "$path: $!\n";
	while (<IN>) {
	    print "$_\n";
	}
	close(IN);
    }
    else {
	&ftp'open($host) || die "Unable to open ftp connection to $host\n";
	&ftp'get($path, "&STDOUT")
	    || die "Unable to get file $path from $host\n";
	&ftp'close;
    }
}

sub news_get {
    local($host, $port, $article) = @_;

    ($handle = &chat'open_port($host, $port))
        || die "chat'open($host,$port): $!\n";

    if ($article =~ /^[^<].+@.+[^>]$/) {
	$request = "article <$article>";
    }
    elsif ($article =~ /^<.+@.+>$/) {
	$request = "article $article";
    }
    elsif ($article =~ /^\*$/) {
	die "Only support URLs of the form: news:article\n";
    }
    elsif ($article) {
	die "Only support URLs of the form: news:article\n";
    }
    else {
	die "Bad url\n";
    }

# Read NNTP Connect message

    *S = *chat'S;
    $string = <S>;
    $string =~ /^(\d*) (.*)$/;
    die "NNTP Error: $2\n" unless ($1 eq '200');

# Send request

    &chat'print($handle,"$request\r\n")
        || die "chat'print($request): $!\n";

# Read reply message

    $string = <S>;
    $string =~ /^(\d*) (.*)$/;
    die "NNTP Error: $2\n" unless ($1 eq '220');

# Get article

    while (<S>) {
	last if (/^\.\r\n$/);
	chop; chop;
	print "$_\n";
    }
    &chat'print($handle,"quit\n")
        || die "chat'print(quit): $!\n";
    &chat'close($handle);
}


__END__

-------------------------URL.pl---------------------------------------
#
# URL.pl - package to parse WWW URLs
#
# Jack Lund 9/3/93 zippy@ccwf.cc.utexas.edu
#

package url;

# Default port numbers for URL services

$ftp_port = 21;
$http_port = 80;
$gopher_port = 70;
$telnet_port = 23;
$wais_port = 210;
$news_port = 119;

# syntax: &url'parse_url(URL)
# returns array containing following:
# 	protocol	protocol string from url. ex: "gopher", "http".
#	host		host that specified protocol server is running on
#	port		port that server answers on
# the rest of the array is protocol-dependant. See code for details.
#

sub parse_url {
    local($url) = @_;

    if ($url =~ m#^(\w+):#) {
	$1 =~ s/[A-Z]/[a-z]/g;
	$protocol = $1;
    } else {
	return undef;
    }

    if ($protocol eq "file" || $protocol eq "ftp") {

# URL of type: file://hostname[:port]/path

	if ($url =~ m#^\s*\w+://([^ \t/:]+):?(\d*)(/.*)$#) {
	    $1 =~ s/[A-Z]/[a-z]/;
	    $host = $1;
	    $port = ($2 ne "" ? $2 : $ftp_port);
	    $path = $3;
	    return ($protocol, $host, $port, $path);
	}

# URL of type: file:/path

	if ($url =~ m#^\s*\w+:(/.*)$#) {
	    $host = `hostname`;  # Current host
	    $port = undef;
	    return ($protocol, $host, $port, $1);
	}
	return undef;
    }

    if ($protocol eq "news") {

# URL of type: news://host[:port]/article

	if ($url =~ m#^\s*\w+://([^ \t:/]):?(\d*)/(.*)$#) {
	    $host = $1;
	    $port = ($2 ne "" ? $2 : $news_port);
	    $selector = $3;
	}

# URL of type: news:article

	elsif ($url =~ m#^\s*\w+:(.*)$#) {
	    $host = $ENV{"NNTPSERVER"};
	    unless ($host) {
		warn "Couldn't get NNTP server name\n";
		return undef;
	    }
	    $port = $news_port;
	    $selector = $1;
	}
	else {
	    return undef;
	}
	return ($protocol, $host, $port, $selector);
    }

# URL of type: http://host[:port]/path[?search-string]

    if ($protocol eq "http") {
	if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)(/[^ \t\?]+)\??(.)*$#) {
	    $1 =~ s/[A-Z]/[a-z]/g;
	    $server = $1;
	    $port = ($2 ne "" ? $2 : $http_port);
	    $path = $3;
	    $search = $4;
	    return ($protocol, $server, $port, $path, $search);
	}
	return undef;
    }

# URL of type: telnet://user@host[:port]

    if ($protocol eq "telnet") {
	if ($url =~ m#^\s*\w+://([^@]+)@([^: \t]+):?(\d*)$#) {
	    $user = $1;
	    $2 =~ s/[A-Z]/[a-z]/g;
	    $host = $2;
	    $port = (defined($3) ? $3 : $telnet_port);
	    return($protocol, $host, $port, $user);
	}

# URL of type: telnet://host[:port]

	if ($url =~ m#^\s*\w+://([^: \t]+):?(\d*)$#) {
	    $1 =~ s/[A-Z]/[a-z]/g;
	    $host = $1;
	    $port = (defined($2) ? $2 : $telnet_port);
	    return($protocol, $host, $port);
	}
	return undef;
    }

# URL of type: gopher://host[:port]/[gtype]selector-string[?search-string]

    if ($protocol eq "gopher") {
	if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)/(\w?)([^ \t\?]*)\??(.*)$#) {
	    $1 =~ s/[A-Z]/[a-z]/g;
	    $server = $1;
	    $port = ($2 ne "" ? $2 : $gopher_port);
	    $gtype = ($3 ne "" ? $3 : 1);
	    $selector = $4;
	    $search = $5;
	    return ($protocol, $server, $port, $gtype, $selector, $search);
	}
	return undef;
    }

# URL of type: wais://host[:port]/database?search-string

    if ($protocol eq "wais") {
	if ($url =~ m#^\s\w+://([\w\d\.]+):?(\d*)/([^\?]+)\??(.*)$#) {
	    $1 =~ s/[A-Z]/[a-z]/g;
	    $server = $1;
	    $port = (defined($2) ? $2 : $wais_port);
	    $database = $3;
	    $search = $4;
	    return ($protocol, $server, $port, $database, $search);
	}
	return undef;
    }
}

-------------------------ftplib.pl---------------------------------------
#
#   This is a set of ftp library routines using chat2.pl
# 
#   Return code information taken from RFC 959

#   Written by Gene Spafford  <spaf@cs.purdue.edu>
#       Last update: 10 April 92,   Version 0.9
#

#
#   Most of these routines communicate over an open ftp channel
#   The channel is opened with the "ftp'open" call.
#

package ftp;
require "chat2.pl";
require "syscall.ph";


###########################################################################
#
#  The following are the variables local to this package.
#  I declare them all up front so I can remember what I called 'em. :-)
#
###########################################################################

LOCAL_VARS: {	
    $Control;
    $Data_handle;
    $Host;
    $Myhost = "\0" x 65;
    (syscall(&SYS_gethostname, $Myhost, 65) == 0) || 
	die "Cannot 'gethostname' of local machine (in ftplib)\n";
    $Myhost =~ s/\0*$//;
    $NeedsCleanup;
    $NeedsClose;
    $ftp_error;
    $ftp_matched;
    $ftp_trans_flag;
    @ftp_list;

    local(@tmp) = getservbyname("ftp", "tcp");
    ($FTP = $tmp[2]) || 
	die "Unable to get service number for 'ftp' (in ftplib)!\n";

    @std_actions = (
	    'TIMEOUT',
	    q($ftp_error = "Connection timed out for $Host!\n"; undef),
	    'EOF', 
	    q($ftp_error = "Connection to $Host timed out unexpectedly!\n"; undef)
    );

    @sigs = ('INT', 'HUP', 'TERM', 'QUIT');  # sigs we'll catch & terminate on
}



###########################################################################
#
#  The following are intended to be the user-callable routines.
#  Each of these does one of the ftp keyword functions.
#
###########################################################################

sub error { ## Public
    $ftp_error;
}
  
#######################################################

#   cd up a directory level

sub cdup { ## Public
    &do_ftp_cmd(200, "cdup");
}

#######################################################

# close an open ftp connection

sub close { ## Public
    return unless $NeedsClose;
    &do_ftp_cmd(221, "quit");
    &chat'close($Control);
    undef $NeedsClose;
    &do_ftp_signals(0);
}

#######################################################

# change remote directory

sub cwd { ## Public
    &do_ftp_cmd(250, "cwd", @_);
}
  
#######################################################

#  delete a remote file

sub delete { ## Public
     &do_ftp_cmd(250, "dele", @_); 
}

#######################################################

#  get a directory listing of remote directory ("ls -l")

sub dir { ## Public
    &do_ftp_listing("list", @_);
}

#######################################################

#  get a remote file to a local file
#    get(remote[, local])

sub get { ## Public
    local($remote, $local) = @_;
    ($local = $remote) unless $local;

    unless (open(DFILE, ">$local")) {
	$ftp_error =  "Open of local file $local failed: $!";
	return undef;
    } else {
	$NeedsCleanup = $local;
    }

    return undef unless &do_open_dport; 	# Open a data channel
    unless (&do_ftp_cmd(150, "retr $remote")) {
	$ftp_error .= "\nFile $remote not fetched from $Host\n";
	close DFILE;
	unlink $local;
	undef $NeedsCleanup;
	return;
    }

    $ftp_trans_flag = 0;

    do {
	&chat'expect($Data_handle, 60,
		     '.|\n', q{print DFILE ($chat'thisbuf) ||
			($ftp_trans_flag = 3); undef $chat'S},
		     'EOF',  '$ftp_trans_flag = 1',
		     'TIMEOUT', '$ftp_trans_flag = 2');
    } until $ftp_trans_flag;

    close DFILE;
    &chat'close($Data_handle);		# Close the data channel

    undef $NeedsCleanup;
    if ($ftp_trans_flag > 1) {
	unlink $local;
	$ftp_error = "Unexpected " . ($ftp_trans_flag == 2 ? "timeout" :
		($ftp_trans_flag != 3 ? "failure" : "local write failure")) .
                " getting $remote\n";
    }
    
    &do_ftp_cmd(226);
}

#######################################################

#  Do a simple name list ("ls")

sub list { ## Public
    &do_ftp_listing("nlst", @_);
}

#######################################################

#   Make a remote directory

sub mkdir { ## Public
    &do_ftp_cmd(257, "mkd", @_);
}

#######################################################

#  Open an ftp connection to remote host

sub open {  ## Public
    if ($NeedsClose) {
	$ftp_error = "Connection still open to $Host!";
	return undef;
    }

    $Host = shift(@_);
    local($User, $Password, $Acct) = @_;
    $User = "anonymous" unless $User;
    $Password = "-" . $main'ENV{'USER'} . "@$Myhost" unless $Password;
    $ftp_error = '';

    unless($Control = &chat'open_port($Host, $FTP)) {
	$ftp_error = "Unable to connect to $Host ftp port: $!";
	return undef;
    }

    unless(&chat'expect($Control, 60,
		        "^220 .*\n",	 "1",
		        "^\d\d\d .*\n",  "undef")) {
	$ftp_error = "Error establishing control connection to $Host";
        &chat'close($Control);
	return undef;
    }
    &do_ftp_signals($NeedsClose = 1);

    unless (&do_ftp_cmd(331, "user $User")) {
	$ftp_error .= "\nUser command failed establishing connection to $Host";
	return undef;
    }

    unless (&do_ftp_cmd("(230|332|202)", "pass $Password")) {
	$ftp_error .= "\nPassword command failed establishing connection to $Host";
	return undef;
    }

    return 1 unless $Acct;

    unless (&do_ftp_cmd("(230|202)", "pass $Password")) {
	$ftp_error .= "\nAcct command failed establishing connection to $Host";
	return undef;
    }
    1;
}

#######################################################

#  Get name of current remote directory

sub pwd { ## Public
    if (&do_ftp_cmd(257, "pwd")) {
	$ftp_matched =~ m/^257 (.+)\r?\n/;
	$1;
    } else {
	undef;
    }    
}

#######################################################

#  Rename a remote file

sub rename { ## Public
    local($from, $to) = @_;

    &do_ftp_cmd(350, "rnfr $from") && &do_ftp_cmd(250, "rnto $to");
}

#######################################################

#  Set transfer type

sub type { ## Public
    &do_ftp_cmd(200, "type", @_); 
}


###########################################################################
#
#  The following are intended to be utility routines used only locally.
#  Users should not call these directly.
#
###########################################################################

sub do_ftp_cmd {  ## Private
    local($okay, @commands, $val) = @_;

    $commands[0] && 
	&chat'print($Control, join(" ", @commands), "\r\n");

    &chat'expect($Control, 60, 
		 "^$okay .*\\n",        '$ftp_matched = $&; 1',
		 '^(\d)\d\d .*\\n', '($String = $&) =~ y/\r\n//d; 
		     $ftp_error = qq{Unexpected reply for ' .
		     "@commands" . ': $String}; 
		     $1 > 3 ? undef : 1',
		 @std_actions
		);
}

#######################################################

sub do_ftp_listing { ## Private
    local(@lcmd) = @_;
    @ftp_list = ();
    $ftp_trans_flag = 0;

    return undef unless &do_open_dport;

    return undef unless &do_ftp_cmd(150, @lcmd);
    do {			#  Following is grotty, but chat2 makes us do it
        &chat'expect($Data_handle, 30,
		"(.*)\r?\n",    'push(@ftp_list, $1)',
		"EOF",     '$ftp_trans_flag = 1');
    } until $ftp_trans_flag;

    &chat'close($Data_handle);
    return undef unless &do_ftp_cmd(226);

    grep(y/\r\n//d, @ftp_list);
    @ftp_list;
}  

#######################################################

sub do_open_dport { ## Private
    local(@foo, $port) = &chat'open_listen;
    ($port, $Data_handle) = splice(@foo, 4, 2);

    unless ($Data_handle) {
	$ftp_error =  "Unable to open data port: $!";
	return undef;
    }

    push(@foo, $port >> 8, $port & 0xff);
    local($myhost) = (join(',', @foo));
    
    &do_ftp_cmd(200, "port $myhost");
}

#######################################################
#
#  To cleanup after a problem
#

sub do_ftp_abort {
    die unless $NeedsClose;

    &chat'print($Control, "abor", "\r\n");
    &chat'close($Data_handle);
    &chat'expect($Control, 10, '.', undef);
    &chat'close($Control);

    close DFILE;
    unlink($NeedsCleanup) if $NeedsCleanup;
    die;
}

#######################################################
#
#  To set signals to do the abort properly
#

sub do_ftp_signals {
    local($flag, $sig) = @_;

    local ($old, $new) = ('DEFAULT', "ftp'do_ftp_abort");
    $flag || (($old, $new) = ($new, $old));
    foreach $sig (@sigs) {
	($SIG{$sig} == $old) && ($SIG{$sig} = $new);
    }
}

1;
-- 
Jack Lund                       Email: zippy@ccwf.cc.utexas.edu
Graphics Services               Phone: (512) 471-3241
UT Austin Computation Center
WWW: <A HREF="http://wwwhost.cc.utexas.edu/test/zippy/zippy.html">Zippy</A>!


