#!/usr/bin/perl # my $http_verbose_debug = 0; $unused_header = q` SimpleGet.pl -- standalone replacement for LWP::Simple For more perl goodness, go to mrob.com/pub/perl This is a fairly minimal implementation of the HTTP GET protocol in Perl. It only does the 'GET' method via http (not https), and handles proxies and redirects but nothing more complicated. It is designed to be a simple replacement for the LWP library for scripts that only use the get(), getprint() and/or getstore() routines. It should be noted that web client Perl scripts fall into two general categories: Simple scripts that just get a page and grab one small bit of information from it, and complex tools like browsers, robots, site-shadowing utilities, custom search engines, etc. For the simple category, the LWP library and the other libraries it depends on (over 1.5 megabytes) is overkill. To use this code, put it in a file called "SimpleGet.pl" somewhere in your Perl @INC searchpath, such as /usr/lib/perl5/site_perl then add this line to your Perl script: require "SimpleGet.pl"; Each of the following is equivalent (except that the last creates a file "temp.html" in the current directory): They load the sample URL and print the HTML to STDOUT, and set $err to the HTTP result (usually 200, unless my ISP's server is down): print get("http://www.mrob.com"); $err = $http_get_result; $_ = get("http://www.mrob.com"); print $_; $err = $http_get_result; $err = getprint("http://www.mrob.com"); $err = getstore("http://www.mrob.com", "temp.html"); system("cat temp.html"); get() reads the data via HTTP and returns it. getprint() sends the data to STDOUT with relatively low memory overhead (useful if the data is large) This library also invites one-liner shell commands such as: perl -e "require 'SimpleGet.pl'; getprint('http://www.mrob.com')" If you want to do anything more than loading simple pages and parsing their contents yourself, you should use LWP and the associated libraries. These libraries include such functions as MIME support, HTML parsing, handling of other transfer protocols like HTTPS and FTP, and much more. To learn more, see http://www.linpro.no/lwp/ Existing web-client scripts that contain 'use LWP::Simple;' and only call the get() and/or getprint() functions can be converted to use SimpleGet by replacing the 'use LWP::Simple;' line with 'require "SimpleGet.pl";' Two extensions have been added to the functionality provided by the LWP get() and getprint() routines: - Set $http_no_cache to 1 to force proxies to reload, or to 0 for a normal GET. - The variable $http_get_result is set to the result code (e.g. 200 or 404). (It is also returned by getprint() and getstore()) - Set $http_timeout to a value (in seconds), or leave unset for the default (60 seconds) Copyright, Usage, Feedback, etc: This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The minimal adaptation of LWP::Simple was created by Robert Munafo (www.mrob.com) from the LWP code. If you have problems, suggestions, or feedback regarding this file, please send them to "mrob at mrob dot com". Please don't bother Gisle Aas (the primary author of LWP) about it because it isn't his creation. REVISION HISTORY 20000114 Initial version, derived from LWP::Simple, with minor additions to handle http_proxy 20000117 Add $http_get_result and get_to_stdout(). 20000118 Add $http_no_cache. Rename get_to_stdout() to getprint, because that's what it's called in LWP. 20000119 Add getstore(), is_success() and "1;" at end; make it work under "use strict;" 20000903 Add allow_post flag and POST handling; add post(), postprint(), and poststore(). Reference: http://www.oreilly.com/catalog/aspnut/chapter/ch06.html 20040531 On redirect, getstore and poststore close and reopen the file, so you get just the redirected data, without the redirecting page prepended (important if you're getting a binary or image file). 20050512 Add (nonstandard) optional Authorization: syntax 20100514 Add &get_pr_hdr and &post_pr_hdr 20120412 Don't try to reopen GET_OUTFILE in getprint mode (oops :) 20151111 Add HTTPS capability. Change AGENT and VERSION because my idiot ISP admins have decided to 403-block any useragent they haven't heard of. (They have told me this is to block "spoofing", which demonstrates how little they understand the issue. The Apache documentation on access control (httpd.apache.org/docs/2.2/howto/access.html) specifically points out, "Access control by User-Agent is an unreliable technique, since the User-Agent header can be set to anything at all, at the whim of the end user." 20171217 Add more debugging 20181005 Add timeout 20181008 Return undef on timeout failure NOTES A fairly good set of HTTP protocol documentation is at https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding Result codes and timeouts can be tested by sending requests to https://httpstat.us/ with desired options, for example: https://httpstat.us/200 https://httpstat.us/200?sleep=5000 `; require 5.004; use MIME::Base64; my ($http_stream_out, %http_loop_check, $allow_post); # Return value is the data (if any) # Result code in $http_get_result sub _trivial_http_get { my($host, $port, $path, $auth401) = @_; my($AGENT, $VERSION, $p); my($ifpost, $postdata); print "_thg HOST=$host, PORT=$port, PATH=$path, AUTH=$auth401\n" if ($http_verbose_debug); $AGENT = "get-minimal"; $VERSION = "20151111"; # $AGENT = "Mozilla"; # $VERSION = "4.0 (compatible; MSIE 4.5; Mac_PowerPC)"; # $VERSION = "5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0"; $AGENT = "Mozilla"; $VERSION = "5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0"; $path =~ s/ /%20/g; if ($allow_post) { if ($path =~ m/^(.+)\?(.+)$/) { $path = $1; $postdata = $2; $ifpost = 1; } else { $ifpost = 0; } } else { $ifpost = 0; } require IO::Socket; local($^W) = 0; # Enables all Perl warnings my $sock = IO::Socket::INET->new(PeerAddr => $host, PeerPort => $port, Proto => 'tcp', Timeout => 60); if(!($sock)) { print "_thg err no socket\n" if ($http_verbose_debug); return; } $sock->autoflush; my $netloc = $host; $netloc .= ":$port" if $port != 80; my $request = ($ifpost ? "POST" : "GET") . " $path HTTP/1.0\015\012" . "Host: $netloc\015\012" . "User-Agent: $AGENT/$VERSION\015\012" . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,text/*;q=0.8,*/*;q=0.7\015\012" . "Accept-Encoding: *\015\012" . "Accept-Language: en-US,en;q=0.7,*;q=0.5\015\012" # . "Connection: close\015\012" # is default in HTTP/1.0 . "DNT: 1\015\012" # . "Referer: http://localhost/index.html\015\012" ; $request .= "Cache-Control: no-cache\015\012" if ($main::http_no_cache); if ($ifpost) { $request .= "Content-type: application/x-www-form-urlencoded\015\012"; $request .= "Content-length: " . length($postdata) . "\015\012"; } if ($auth401 ne "") { $request .= "Authorization: BASIC " . encode_base64($auth401) . "\015\012"; } $request .= "\015\012"; if ($ifpost) { $request .= $postdata . "\015\012"; } print $request if ($http_verbose_debug); print $sock $request; my $buf = ""; my $n; my $b1 = ""; my $hdr = ""; while ($n = sysread($sock, $buf, 8*1024, length($buf))) { if ($b1 eq "") { # first block? $b1 = $buf; # Save this for errorcode parsing $buf =~ s/.+?\015?\012\015?\012//s; # zap header $hdr = $b1; $hdr =~ s|^(.+?)\015?\012\015?\012.*$|\1|s; # zap all but header print ("\n\n<>e1\n\n" if ($http_verbose_debug); return undef; } $main::http_get_result = 200; if ($b1 =~ m,^HTTP/\d+\.\d+\s+(\d+)[^\012]*\012,) { print "\nfoo1\n" if ($http_verbose_debug); $main::http_get_result = $1; if (($main::http_get_result =~ /^30[1237]/) && ($b1 =~ /\012Location:\s*(\S+)/)) { # redirect my $url = $1; return undef if $http_loop_check{$url}++; # redirect must replace, not concatenate, output to a disk file. # if we're streaming, it will still concatenate. if ($http_stream_out) { if ($g_getstore_filepath eq 'stdout') { # Do not close and reopen } else { close $GET_OUTFILE; open($GET_OUTFILE, "> $g_getstore_filepath"); } } return _get($url); } if ($main::http_get_result =~ /^2/) { # Okay } else { # 404 not found, 501 server not available, etc. return undef; } } else { print "\nfoo2\n" if ($http_verbose_debug); } return $buf; } # End of _trivial.http_get # Return value is the data (if any) # Result code in $http_get_result sub _trivial_https_get { my($host, $port, $path, $auth401) = @_; my($AGENT, $VERSION, $p); my($ifpost, $postdata); print "_thsg HOST=$host, PORT=$port, PATH=$path, AUTH=$auth401\n" if ($http_verbose_debug); $AGENT = "get-minimal"; $VERSION = "20151111"; $AGENT = "Mozilla"; $VERSION = "4.0 (compatible; MSIE 4.5; Mac_PowerPC)"; $VERSION = "5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0"; $path =~ s/ /%20/g; if ($allow_post) { if ($path =~ m/^(.+)\?(.+)$/) { $path = $1; $postdata = $2; $ifpost = 1; } else { $ifpost = 0; } } else { $ifpost = 0; } # %%% We need to check if this fails and do something better. use IO::Socket::SSL 'inet4'; local($^W) = 0; # Enables all Perl warnings print "_thsg open '$host:$port'\n" if ($http_verbose_debug); my $sock = IO::Socket::SSL->new(PeerAddr => "$host:https", PeerPort => $port, SSL_verify_mode => 0x00); if(!($sock)) { print ("_thsg err no socket '" . (IO::Socket::SSL::errstr()) . "'\n") if ($http_verbose_debug); return; } if (0) { # Verify the hostname. This does not seem to work, so it is disabled. if(!($sock->verify_hostname($host, 'http'))) { print ("_thsg hostname verification failed\n") if ($http_verbose_debug); return; } } $sock->autoflush; my $netloc = $host; $netloc .= ":$port" if $port != 443; my $request = ($ifpost ? "POST" : "GET") . " $path HTTP/1.0\015\012" . "Host: $netloc\015\012" . "User-Agent: $AGENT/$VERSION/u\015\012" . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,text/*;q=0.8,*/*;q=0.7\015\012" . "Accept-Encoding: *\015\012" . "Accept-Language: en-US,en;q=0.7,*;q=0.5\015\012" # . "Connection: close\015\012" # is default in HTTP/1.0 . "DNT: 1\015\012" # . "Referer: http://localhost/index.html\015\012" ; $request .= "Cache-Control: no-cache\015\012" if ($main::http_no_cache); if ($ifpost) { $request .= "Content-type: application/x-www-form-urlencoded\015\012"; $request .= "Content-length: " . length($postdata) . "\015\012"; } if ($auth401 ne "") { $request .= "Authorization: BASIC " . encode_base64($auth401) . "\015\012"; } $request .= "\015\012"; if ($ifpost) { $request .= $postdata . "\015\012"; } print $request if ($http_verbose_debug); print $sock $request; my $buf = ""; my $n; my $b1 = ""; my $hdr = ""; while ($n = sysread($sock, $buf, 8*1024, length($buf))) { if ($b1 eq "") { # first block? $b1 = $buf; # Save this for errorcode parsing $buf =~ s/.+?\015?\012\015?\012//s; # zap header $hdr = $b1; $hdr =~ s|^(.+?)\015?\012\015?\012.*$|\1|s; # zap all but header if ($http_header_out) { print $GET_ERRFILE "$hdr\n"; $hdr = ""; } } if ($http_stream_out) { print $GET_OUTFILE $buf; $buf = ""; } } if (!(defined($n))) { return undef; } $main::http_get_result = 200; if ($b1 =~ m,^HTTP/\d+\.\d+\s+(\d+)[^\012]*\012,) { $main::http_get_result = $1; if (($main::http_get_result =~ /^30[1237]/) && ($b1 =~ /\012Location:\s*(\S+)/)) { # redirect my $url = $1; return undef if $http_loop_check{$url}++; # redirect must replace, not concatenate, output to a disk file. # if we're streaming, it will still concatenate. if ($http_stream_out) { if ($g_getstore_filepath eq 'stdout') { # Do not close and reopen } else { close $GET_OUTFILE; open($GET_OUTFILE, "> $g_getstore_filepath"); } } return _get($url); } if ($main::http_get_result =~ /^2/) { # Okay } else { return undef; } } return $buf; } # End of _trivial.https_get # Wrapper for _trivial.http_get and _trivial.https_get that implements # timeout. This is based on the example usage from the documentation for # perl's alarm function, perldoc.perl.org/functions/alarm.html: # eval { # local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required # alarm $timeout; # my $nread = sysread $socket, $buffer, $size; # alarm 0; # }; # if ($@) { # die unless $@ eq "alarm\n"; # propagate unexpected errors # # timed out # } # else { # # didn't # } sub _thgto { my($host, $port, $path, $auth401, $ssl) = @_; my($rv, $to); $to = ($main::http_timeout) + 0; if ($to <= 0) { $to = 60; } print "_thgto to=$to ssl=$ssl\n" if ($http_verbose_debug); $rv = undef; eval { local $SIG{ALRM} = sub { die "alarm\n" }; alarm $to; # Causes an ALRM signal unless we reach "alarm 0" in time if ($ssl) { $rv = _trivial_https_get($host, $port, $path, $auth401); } else { $rv = _trivial_http_get($host, $port, $path, $auth401); } alarm 0; # Cancel the ALRM - we got here in time }; if ($@) { die unless $@ eq "alarm\n"; # propagate unexpected errors # timed out } return $rv; } # End of thg.to # Return value is the data (if any) # Result code in $http_get_result sub _get { my $url = shift; my $proxy = ""; my($auth401); print "_get $url\n" if ($http_verbose_debug); # To get web pages that require a name and password, append a string like # " Authorization: myname:mysecret" $auth401 = ""; if ($url =~ m/^(.+) Authorization: (.+)$/) { $url = $1; $url =~ s/ +$//; $auth401 = $2; $auth401 =~ s/ //g; } grep {(lc($_) eq "http_proxy") && ($proxy = $ENV{$_})} keys %ENV; if (($proxy eq "") && $url =~ m,^http://([^/:]+)(?::(\d+))?(/\S*)?$,) { print "_get branch1 |$url|$auth401|\n" if ($http_verbose_debug); my $host = $1; my $port = $2 || 80; my $path = $3; $path = "/" unless defined($path); return _thgto($host, $port, $path, $auth401, 0); } elsif ($proxy =~ m,^http://([^/:]+):(\d+)(/\S*)?$,) { print "_get branch2\n" if ($http_verbose_debug); my $host = $1; my $port = $2; my $path = $url; return _thgto($host, $port, $path, $auth401, 0); } elsif (($proxy eq "") && $url =~ m,^https://([^/:]+)(?::(\d+))?(/\S*)?$,) { print "_get branch3 |$url|$auth401|\n" if ($http_verbose_debug); my $host = $1; my $port = $2 || 443; my $path = $3; $path = "/" unless defined($path); return _thgto($host, $port, $path, $auth401, 1); } elsif ($proxy =~ m,^https://([^/:]+):(\d+)(/\S*)?$,) { print "_get branch4\n" if ($http_verbose_debug); my $host = $1; my $port = $2; my $path = $url; return _thgto($host, $port, $path, $auth401, 1); } else { print "_get branch5 |$proxy|$url|\n" if ($http_verbose_debug); return undef; } } # Return value is the data (if any) # Result code in $http_get_result sub get ($) { $http_stream_out = 0; $http_header_out = 0; $allow_post = 0; %http_loop_check = (); goto \&_get; } sub getprint ($) { my $url = shift; $allow_post = 0; $http_stream_out = 1; $g_getstore_filepath = 'stdout'; open($GET_OUTFILE, ">&STDOUT"); $http_header_out = 0; %http_loop_check = (); _get($url); close $GET_OUTFILE; return $main::http_get_result; } sub getstore ($$) { my $url = shift; my $file = shift; $allow_post = 0; $http_stream_out = 1; $http_header_out = 0; $g_getstore_filepath = $file; open($GET_OUTFILE, "> $file"); %http_loop_check = (); _get($url); close $GET_OUTFILE; return $main::http_get_result; } # Like getprint but also routes header to stderr sub get_pr_hdr ($) { my $url = shift; $allow_post = 0; $http_stream_out = 1; $g_getstore_filepath = 'stdout'; open($GET_OUTFILE, ">&STDOUT"); $http_header_out = 2; open($GET_ERRFILE, ">&STDERR"); %http_loop_check = (); _get($url); close $GET_OUTFILE; return $main::http_get_result; } sub post ($) { $http_stream_out = 0; $http_header_out = 0; $allow_post = 1; %http_loop_check = (); goto \&_get; } sub postprint ($) { my $url = shift; $allow_post = 1; $http_stream_out = 1; open($GET_OUTFILE, ">&STDOUT"); $http_header_out = 0; %http_loop_check = (); _get($url); close $GET_OUTFILE; return $main::http_get_result; } # Like postprint but also routes header to stderr sub post_pr_hdr ($) { my $url = shift; $allow_post = 1; $http_stream_out = 1; open($GET_OUTFILE, ">&STDOUT"); $http_header_out = 2; open($GET_ERRFILE, ">&STDERR"); %http_loop_check = (); _get($url); close $GET_OUTFILE; return $main::http_get_result; } sub poststore ($$) { my $url = shift; my $file = shift; $allow_post = 1; $http_stream_out = 1; $http_header_out = 0; $g_getstore_filepath = $file; open($GET_OUTFILE, "> $file"); %http_loop_check = (); _get($url); close $GET_OUTFILE; return $main::http_get_result; } sub is_success ($) { my $code = shift; return ($code =~ /^2/); } 1; # end of SimpleGet.pl 013