#!/usr/bin/perl -T
## -----------------------------------------------------------------------
##
##   Copyright 2011 Intel Corporation; author: H. Peter Anvin
##
##   This program is free software; you can redistribute it and/or
##   modify it under the terms of the GNU General Public License as
##   published by the Free Software Foundation, Inc.; either version 2
##   of the License, or (at your option) any later version;
##   incorporated herein by reference.
##
## -----------------------------------------------------------------------

#
# kernel.org bulk file upload client
#

use strict;
use warnings;
use bytes;
use Encode qw(encode decode);
use File::Spec;

my $blksiz = 1024*1024;

# Global options
my %opt = (
    'rsh'     => 'ssh -a -x -k -T',
    'host'    => 'kup.kernel.org',
    'batch'   => 0,
    'verbose' => 0,
    );

# This is a client, and so running with tainting on is a bit overly
# paranoid.  As a result we have to explicitly untaint certain bits from
# the environment.
sub untaint($) {
    my($s) = @_;

    $s =~ /^(.*)$/;
    return $1;
}

$ENV{'PATH'} = untaint($ENV{'PATH'});
if (defined $ENV{'KUP_RSH'}) {
    $opt{'rsh'} = $ENV{'KUP_RSH'};
}
if (defined $ENV{'KUP_HOST'}) {
    $opt{'host'} = $ENV{'KUP_HOST'};
}
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};   # Make %ENV safer

# We process the command set twice, once as a dry run and one for real,
# to catch as many errors as early as possible
my @args;
my $real;

# Usage description
sub usage($) {
    my($err) = @_;

    print STDERR "Usage: $0 [global options] command [-- command...]\n";
    print STDERR "\n";
    print STDERR "Global options:\n";
    print STDERR "   -b  --batch             Output command stream to stdout\n";
    print STDERR "   -e  --rsh=command       Send output to command, override KUP_RSH\n";
    print STDERR "   -o  --host=[user@]host  Connect to [user@]host, override KUP_HOST\n";
    print STDERR "   -v  --verbose           Print each command to stderr as it is sent\n";
    print STDERR "\n";
    print STDERR "Commands:\n";
    print STDERR "   put local_file signature remote_path\n";
    print STDERR "   put --tar [--prefix=] remote_tree ref signature remote_path\n";
    print STDERR "   put --diff remote_tree ref1 ref2 signature remote_path\n";
    print STDERR "   mkdir remote_path\n";
    print STDERR "   mv|move old_path new_path\n";
    print STDERR "   ln|link old_path new_path\n";
    print STDERR "   rm|del|delete old_path\n";
    print STDERR "   ls|dir path...\n";

    exit $err;
}

# Return true if the supplied string is valid UTF-8 without special
# characters
sub is_clean_string($)
{
    no bytes;
    # use feature 'unicode_strings';	-- is this needed here?

    my($b) = @_;
    my $f = decode('UTF-8', $b, Encode::FB_DEFAULT);

    return 0 if ($f =~ m:[\x{0000}-\x{001f}\x{007f}-\x{00a0}\x{fffd}-\x{ffff}]:);
    return 1;
}

# This returns true if the given argument is a valid filename in its
# canonical form.  Double slashes, relative paths, dot files, control
# characters, and malformed UTF-8 is not permitted.  We cap the length
# of each pathname component to 100 bytes so we can add an extension
# without worrying about it, and the entire pathname to 1024 bytes.
sub is_valid_filename($)
{
    use bytes;

    my($f) = @_;

    return 0 if (!defined($f));	# If undefined, clearly not valid

    return 0 if (length($f) > 1024); # Reject ridiculously long paths
    return 0 if (!is_clean_string($f)); # Reject bad UTF-8 and control characters
    return 0 if ($f !~ m:^/:);	# Reject relative paths
    return 0 if ($f =~ m:/$:);	# Reject paths ending in /
    return 0 if ($f =~ m://:);	# Reject double slashes

    # Reject filename components starting with dot or dash, covers . and ..
    return 0 if ($f =~ m:/[\.\-]:);

    # Reject undesirable filename characters anywhere in the name.
    # This isn't inherently security-critical, and could be tuned if
    # users need it...
    return 0 if ($f =~ m:[\!\"\$\&\'\*\;\<\>\?\\\`\|]:);

    # Make sure we can create a filename after adding .bz2 or similar.
    # We can't use the obvious regexp here, because regexps operate on
    # characters, not bytes.  The limit of 100 is semi-arbitrary, but
    # we shouldn't need filenames that long.
    my $n = 0;
    my $nmax = 0;
    for (my $i = 0; $i < length($f); $i++) {
	my $c = substr($f, $i, 1);
	$n = ($c eq '/') ? 0 : $n+1;
	$nmax = ($n > $nmax) ? $n : $nmax;
    }
    return 0 if ($nmax > 100);

    return 1;
}

# Clean up a filename so that it is more likely to pass the
# canonicalization test.  An optional second argument is used with
# two-filename commands (move, link); it should be the already
# canonicalized first argument.
#
# This can return undef for some invalid pathnames.  This needs to be
# caught by is_valid_filename().
sub canonicalize_path($;$)
{
    my($file, $root) = @_;

    $root = '/' unless defined($root);

    my $tail = '';
    if ($root =~ m:^(.*/)([^/]+)$:) {
	$root = $1;
	$tail = $2;
    }

    if ($root !~ m:^/: || $root !~ m:/$:) {
	die "$0: internal error: non-canonical root\n";
    }

    if ($file !~ m:^/:) {
	$file = $root . $file;
    }
    if ($file =~ m:/$:) {
	$file .= $tail;
    }

    my @path = ();
    my $wasspc = 1;
    # The -1 argument to split means "preserve trailing empty fields"
    foreach my $s (split(/\//, $file, -1)) {
	if ($s eq '' || $s eq '.') {
	    $wasspc = 1;
	} elsif ($s eq '..') {
	    # If this ran off the root, error
	    return undef if (!defined(pop(@path)));
	    $wasspc = 1;
	} else {
	    push(@path, $s);
	    $wasspc = 0;
	}
    }

    # If this ended in a special component, error
    return undef if ($wasspc);

    # The initial '' forces the result to begin with a slash
    return join('/', '', @path);
}

# Parse global options
sub parse_global_options()
{
    while (scalar @ARGV && $ARGV[0] =~ /^-/) {
	my $arg = shift(@ARGV);

	if ($arg eq '-b' || $arg eq '--batch') {
	    $opt{'batch'} = 1;
	} elsif ($arg eq '-e' || $arg eq '--rsh' || $arg eq '--ssh') {
	    $opt{'rsh'} = shift(@ARGV);
	} elsif ($arg =~ /^--rsh=(.+)$/) {
	    $opt{'rsh'} = $1;
	} elsif ($arg eq '-o' || $arg eq '--host') {
	    $opt{'host'} = shift(@ARGV);
	} elsif ($arg =~ /^--host=(.+)$/) {
	    $opt{'host'} = $1;
	} elsif ($arg eq '-v' || $arg eq '--verbose') {
	    $opt{'verbose'}++;
	} elsif ($arg eq '-h' || $arg eq '--help') {
	    usage(0);
	} else {
	    die "$0: unknown option: $arg\n";
	}
    }
}

# Encode a string
sub url_encode($)
{
    my($s) = @_;

    # Hack to encode an empty string
    return '%' if ($s eq '');

    my $o = '';

    foreach my $c (unpack("C*", $s)) {
	if ($c > 32 && $c < 126 && $c != 37 && $c != 43) {
	    $o .= chr($c);
	} elsif ($c == 32) {
	    $o .= '+';
	} else {
	    $o .= sprintf("%%%02X", $c);
	}
    }

    return $o;
}

# Configure the output stream
sub setup_output()
{
    # In batch mode, we dump the output to stdout so the user can
    # aggregate it best they wish
    unless ($opt{'batch'}) {
	if ($opt{'rsh'} !~ /^([-a-zA-Z0-9._=\@:\s]+)$/) {
	    die "$0: suspicious KUP_RSH command (if this is bogus let hpa know)\n";
	}
	my $rsh = $1;
	if ($opt{'host'} !~ /^([-a-zA-Z0-9._\@]+)$/) {
	    die "$0: suspicious KUP_HOST (if this is bogus let hpa know)\n";
	}
	$rsh .= " \Q$1";
	open(STDOUT, '|-', $rsh)
	    or die "$0: cannot execute rsh command ", $rsh, "\n";
    }
    binmode(STDOUT);
}

# Terminate the output process
sub close_output()
{
    $| = 1;			# Flush STDOUT
    unless ($opt{'batch'}) {
	close(STDOUT);
    }
}

# Print a command to STDOUT, and if requested, to STDERR
sub command(@)
{
    if ($real) {
	my $cmd = join(' ', @_);

	print STDERR $cmd, "\n" if ($opt{'verbose'});
	print $cmd, "\n";
    }
}

sub get_data_format($)
{
    my($data) = @_;

    my $magic2 = substr($data, 0, 2);
    my $magic4 = substr($data, 0, 4);
    my $magic6 = substr($data, 0, 6);

    my $fmt = '%';		# Meaning straight binary

    if ($magic2 eq "\037\213") {
	$fmt = 'gz';
    } elsif ($magic4 =~ /^BZh[1-9]$/) {
	# The primary bzip2 magic is so crappy, so look
	# for the magic number of the first packet
	# (either a compression packet or an end of file packet.)
	# Funny enough, the magics on the packets are better
	# than the magics on the file format, and even so
	# they managed to pick a magic for the compression
	# packet which has no non-ASCII bytes in it...

	my $submagic = substr($data, 4, 6);

	if ($submagic eq "\x31\x41\x59\x26\x53\x59" ||
	    $submagic eq "\x17\x72\x45\x38\x50\x90") {
	    $fmt = 'bz2';
	}
    } elsif ($magic6 eq "\x{fd}7zXZ\0") {
	$fmt = 'xz';
    }

    return $fmt;
}

sub cat_file($$$)
{
    my($cmd, $file, $fmt) = @_;

    my $data;
    open($data, '<', $file)
	or die "$0: cannot open: $file: $!\n";
    if (! -f $data) {
	die "$0: not a plain file: $file\n";
    }
    my $size = -s _;

    binmode($data);

    if ($real) {
	if ($size < 2) {
	    # Must be a plain file
	    $fmt = '%';
	}

	if (defined($fmt)) {
	    command($cmd, $size, $fmt);
	}

	my $blk;
	my $len;

	while ($size) {
	    $len = ($size < $blksiz) ? $size : $blksiz;
	    $len = read($data, $blk, $len);

	    if (!$len) {
		die "$0: premature end of data (file changed?): $file\n";
	    }

	    if (!defined($fmt)) {
		$fmt = get_data_format($blk);
		command($cmd, $size, $fmt);
	    }

	    print $blk;
	    $size -= $len;
	}
    }

    close($data);
}

# PUT command
sub cmd_put()
{
    my $file = shift @args;
    my $file_tail = undef;

    if ($file eq '-t' || $file eq '--tar') {
	# tar hack

	my $remote_tree = shift @args;
	my $prefix = '';

	if ($remote_tree eq '-p' || $remote_tree eq '--prefix') {
	    $prefix = shift @args;
	    $remote_tree = shift @args;
	} elsif ($remote_tree =~ /^--prefix=(.+)$/) {
	    $prefix = $1;
	    $remote_tree = shift @args;
	}

	my $ref = shift(@args);

	if (!defined($ref)) {
	    usage(1);
	}

	my $xrt = $remote_tree;
	$remote_tree = canonicalize_path($remote_tree);
	if (!is_valid_filename($remote_tree) || $remote_tree !~ /\.git$/) {
	    die "$0: invalid path name for git tree: $xrt\n";
	}
	if (!is_clean_string($ref)) {
	    die "$0: invalid ref: $ref\n";
	}

	command('TAR', url_encode($remote_tree),
		url_encode($ref), url_encode($prefix));
    } elsif ($file eq '-d' || $file eq '--diff') {
	# diff hack

	my $remote_tree = shift @args;
	my $prefix = '';

	my $ref1 = shift(@args);
	my $ref2 = shift(@args);

	if (!defined($ref2)) {
	    usage(1);
	}

	my $xrt = $remote_tree;
	$remote_tree = canonicalize_path($remote_tree);
	if (!is_valid_filename($remote_tree) || $remote_tree !~ /\.git$/) {
	    die "$0: invalid path name for git tree: $xrt\n";
	}
	if (!is_clean_string($ref1)) {
	    die "$0: invalid ref: $ref1\n";
	}
	if (!is_clean_string($ref2)) {
	    die "$0: invalid ref: $ref2\n";
	}

	command('DIFF', url_encode($remote_tree), url_encode($ref1),
		url_encode($ref2));
    } elsif ($file =~ /^-/) {
	die "$0: unknown option to put command: $file\n";
    } else {
	# Plain data blob.  We don't actively attempt to compress it
	# since ssh usually has a layer of compression, but if it is
	# already a compressed file we send it as-is and let the
	# server decompress it.

	cat_file('DATA', $file, undef);

	# Get the local filename without directory
	my($vol, $dir);
	($vol, $dir, $file_tail) = File::Spec->splitpath($file);
    }

    my $sign   = shift @args;
    my $remote = shift @args;

    if (!defined($remote)) {
	usage(1);
    }

    # This allows the user to not specify the filename if it is
    # the same as on the local filesystem by ending the pathname
    # with a slash
    if ($remote =~ m:/$: && defined($file_tail)) {
	$remote .= $file_tail;
    }
    
    my $xrt = $remote;
    $remote = canonicalize_path($remote);
    if (!is_valid_filename($remote)) {
	die "$0: invalid pathname: $xrt\n";
    }

    if ($remote =~ /\.sign$/) {
	die "$0: target filename cannot end in .sign\n";
    }

    # DWIM: .bz2, .xz -> .gz
    $remote =~ s/\.(bz2|xz)$/.gz/;

    cat_file('SIGN', $sign, undef);
    command('PUT', url_encode($remote));
}

# MKDIR command
sub cmd_mkdir()
{
    my $remote = shift @args;

    if (!defined($remote)) {
	usage(1);
    }

    my $xrt = $remote;
    $remote = canonicalize_path($remote);
    if (!is_valid_filename($remote)) {
	die "$0: invalid pathname: $xrt\n";
    }

    if ($remote =~ /\.(sign|gz|bz2|xz)$/) {
	die "$0: a directory name cannot end in .sign, .gz, .bz2, .xz\n";
    }

    command('MKDIR', url_encode($remote));
}

# DELETE command
sub cmd_delete()
{
    my $remote = shift @args;

    if (!defined($remote)) {
	usage(1);
    }

    my $xrt = $remote;
    $remote = canonicalize_path($remote);
    if (!is_valid_filename($remote)) {
	die "$0: invalid pathname: $xrt\n";
    }

    if ($remote =~ /\.sign$/) {
	die "$0: cannot delete .sign files directly\n";
    }

    # DWIM: .bz2, .xz -> .gz
    $remote =~ s/\.(bz2|xz)$/.gz/;

    command('DELETE', url_encode($remote));
}

# MOVE or LINK command
sub cmd_move_link($)
{
    my($cmd) = @_;

    my $from = shift @args;
    my $to   = shift @args;

    if (!defined($to)) {
	usage(1);
    }

    my $xrt = $from;
    $from = canonicalize_path($from);
    if (!is_valid_filename($from)) {
	die "$0: invalid pathname: $xrt\n";
    }

    $xrt = $to;
    $to = canonicalize_path($to, $from);
    if (!is_valid_filename($to)) {
	die "$0: invalid pathname: $xrt\n";
    }

    if ($from =~ /\.sign$/ || $to =~ /\.sign$/) {
	die "$0: cannot explicitly move .sign files\n";
    }
    if ($from =~ /\.(gz|bz2|xz)$/ && $to =~ /\.(gz|bz2|xz)$/) {
	$from =~ s/\.(bz2|xz)$/.gz/;
	$to   =~ s/\.(bz2|xz)$/.gz/;
    } elsif ($from =~ /\.(gz|bz2|xz)$/ || $to =~ /\.(gz|bz2|xz)$/) {
	die "$0: cannot move to or from compressed filenames\n";
    }

    if ($from eq $to) {
	die "$0: moving filename to self: $from\n";
    }

    command($cmd, url_encode($from), url_encode($to));
}

# DIR command (supports arbitrary number of arguments)
sub cmd_dir()
{
    while (defined($args[0]) && $args[0] ne '--') {
	my $d = shift @args;
	$d =~ s:/$::g;
	if ($d ne '') {
	    my $xrt = $d;
	    $d = canonicalize_path($d);
	    if (!is_valid_filename($d)) {
		die "$0: invalid pathname: $xrt\n";
	    }
	}
	$d .= '/';

	command('DIR', $d);
    }
}

# Process commands
sub process_commands()
{
    while (1) {
	my $cmd = shift(@args);

	if (!defined($cmd)) {
	    usage(1);
	}

	$cmd = "\L${cmd}";

	if ($cmd eq 'put') {
	    cmd_put();
	} elsif ($cmd eq 'mkdir') {
	    cmd_mkdir();
	} elsif ($cmd eq 'move' || $cmd eq 'mv') {
	    cmd_move_link('MOVE');
	} elsif ($cmd eq 'link' || $cmd eq 'ln') {
	    cmd_move_link('LINK');
	} elsif ($cmd eq 'delete' || $cmd eq 'del' || $cmd eq 'rm') {
	    cmd_delete();
	} elsif ($cmd eq 'ls' || $cmd eq 'dir') {
	    cmd_dir();
	} else {
	    die "$0: unknown command: $cmd\n";
	}

	my $sep = shift(@args);

	last if (!defined($sep)); # End of command line

	if ($sep ne '--') {
	    die "$0: garbage at end of $cmd command\n";
	}
    }
}

# Main program
parse_global_options();

# "Dry run" pass
$real = 0;
@args = @ARGV;
process_commands();

# Establish output stream
setup_output();

# "Real" pass
$real = 1;
@args = @ARGV;
process_commands();

# Close the output to allow the child process to complete
close_output();

exit 0;
