#!/usr/bin/env perl

use strict;
use warnings;
use autodie qw(:all);

use App::Test::Generator;
use File::Spec;
use File::Temp;
use Getopt::Long qw(GetOptions);
use Pod::Usage;
use YAML::XS qw(LoadFile);

=head1 NAME

fuzz-harness-generator - Generate fuzzing + corpus-based test harnesses from test schemas

=head1 SYNOPSIS

  fuzz-harness-generator [-r] [-o output_file] input.yaml
  fuzz-harness-generator --dry-run input.yaml
  fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t
  fuzz-harness-generator --replay-corpus schemas/corpus/translate.json -o t/fuzz_replay.t

=head1 DESCRIPTION

This tool generates a test file that fuzzes and validates a target module's function or method,
using both randomized fuzz cases and static corpus cases (Perl or YAML).

It can also generate regression test files from corpus JSON files previously
written by C<extract-schemas --fuzz>, using C<--replay-corpus>.

A starter C<input.yaml> can be created using C<extract-schemas> which is also in this package.

=head1 OPTIONS

=over 4

=item B<--help>

Show this help.

=item B<--input>

The input configuration file

=item B<--output>

The (optional) output file.

=item B<--dry-run>

Validate the input configuration and schema extraction without writing any output files or running tests.

=item B<--run>

Call C<prove> on the output file.

C<fuzz-harness-generator -r t/conf/data_text_append.conf> will, therefore, dynamically create and run tests on the C<append> method of L<Data::Text>

=item B<--replay-corpus> PATH

Instead of generating a fuzz harness, generate a regression test file from
one or more corpus JSON files previously written by C<extract-schemas --fuzz>.

PATH may be either:

=over 4

=item * A single corpus file, e.g. C<schemas/corpus/translate.json>

=item * A directory, e.g. C<schemas/corpus/> — all C<*.json> files in that
directory will be included

=back

The generated test file contains one failing test per bug recorded in the
corpus. Each test calls the target method with the exact input that previously
caused a crash and expects it B<not> to die. Tests will be red until the
underlying bug is fixed, at which point they go green and stay green —
acting as permanent regression tests.

Only corpus entries with recorded bugs are included. Clean corpus entries
(inputs that did not cause a bug) are ignored.

=item B<--version>

Prints the version of L<App::Test::Generator>

=back

=cut

# Command-line state, populated by GetOptions() below
my $infile;		# --input: YAML configuration file
my $outfile;		# --output: where to write the generated test file
my $help;		# --help: show usage and exit
my $run;		# --run: invoke prove on the generated file
my $verbose;		# --verbose: sets TEST_VERBOSE for the test run
my $version;		# --version: print App::Test::Generator's version
my $dry_run;		# --dry-run: validate configuration without writing output
my $replay_corpus;	# --replay-corpus: path to corpus JSON file(s)

# 'bundling' allows single-letter options to be combined (e.g. -rv)
Getopt::Long::Configure('bundling');

GetOptions(
	'help|h'          => \$help,
	'input|i=s'       => \$infile,
	'dry-run|n'       => \$dry_run,
	'output|o=s'      => \$outfile,
	'run|r'           => \$run,
	'verbose|v'       => \$verbose,
	'version|V'       => \$version,
	'replay-corpus|R=s' => \$replay_corpus,
) or pod2usage(2);

pod2usage(-exitval => 0, -verbose => 1) if($help);

if($version) {
	print $App::Test::Generator::VERSION, "\n";
	exit 0;
}

# ---------------------------------------------------------------------------
# --replay-corpus mode: generate a regression .t from corpus bug entries
# ---------------------------------------------------------------------------

if($replay_corpus) {
	# Replay mode is mutually exclusive with the fuzz-generation options
	pod2usage('--replay-corpus cannot be combined with --dry-run') if $dry_run;
	pod2usage('--replay-corpus cannot be combined with --input')   if $infile;

	# Expand the PATH argument (file or directory) into *.json corpus files
	my @corpus_files = _collect_corpus_files($replay_corpus);
	die "No corpus JSON files found at: $replay_corpus\n" unless @corpus_files;

	my $tap = _generate_replay_tap(@corpus_files);

	if($outfile) {
		open(my $fh, '>', $outfile)
			or die "Cannot write to $outfile: $!";
		print $fh $tap;
		close $fh;
		chmod 0755, $outfile;
		print "Replay test written to: $outfile\n";
		if($run) {
			# Propagate prove's real exit status; system() returns it shifted
			exit system('prove', '-l', $outfile) >> 8;
		}
	} else {
		# No --output given: emit the generated test to STDOUT
		print $tap;
	}
	exit 0;
}

# The config file may be given either via --input or as a positional
# argument, but not both
if($infile && @ARGV) {
	pod2usage('Specify input file either as argument or via --input, not both');
}

# Early sanity checks on the --input file: must be parseable YAML, must be
# a hash, and must contain the required 'function' key.
# NOTE(review): this validation only runs for --input; a config file passed
# as a positional argument (picked up below) is not pre-validated here —
# confirm whether that asymmetry is intentional.
if($infile) {
	my $schema = eval { LoadFile($infile) };
	if($@) {
		die "Cannot parse '$infile' as YAML: $@";
	}
	unless(ref($schema) eq 'HASH') {
		die "Input file '$infile' does not contain a YAML hash";
	}
	unless($schema->{function}) {
		die "Input file '$infile' is missing required 'function' key";
	}
}

# Fall back to the positional argument; bail out if neither was supplied
$infile ||= shift @ARGV or pod2usage('No config file given');

if($dry_run && $run) {
	pod2usage('--dry-run cannot be used with --run');
}

if($dry_run && $outfile) {
	warn '--dry-run specified; --output will be ignored';
}

if($verbose) {
	# TEST_VERBOSE is honoured by the Test::Harness/prove toolchain
	$ENV{'TEST_VERBOSE'} = 1;
}

# --run without --output: generate into a throwaway temp file and run it
if($run && !$outfile) {
	my ($fh, $tmp) = File::Temp::tempfile();
	close $fh;

	App::Test::Generator->generate($infile, $tmp);

	# Propagate prove's real exit status; system() returns it shifted
	exit system('prove', '-l', $tmp) >> 8;
}

if($dry_run) {
	# Generate into a temp file purely to exercise parsing/validation,
	# then discard the output
	my ($fh, $tmp) = File::Temp::tempfile();
	close $fh;

	eval {
		App::Test::Generator->generate($infile, $tmp);
		1;
	} or do {
		die "Dry-run failed for $infile: $@";
	};

	unlink $tmp;
	print "Dry-run OK: $infile parsed and validated successfully\n";
	exit 0;
} elsif($outfile && -e $outfile && !$run) {
	warn "Overwriting existing file: $outfile";
}

App::Test::Generator->generate($infile, $outfile);

if($outfile) {
	# Make self-running script outputs executable
	chmod 0755, $outfile if($outfile =~ /\.(pl|cgi)$/);
	if($run) {
		# Use list form to avoid shell interpolation of $outfile
		system('prove', '-l', $outfile);
	}
}

exit 0;

# ---------------------------------------------------------------------------
# Helpers for --replay-corpus
# ---------------------------------------------------------------------------

# --------------------------------------------------
# _collect_corpus_files
#
# Purpose:    Resolve the --replay-corpus PATH into
#             the list of corpus JSON files to read.
#             A file path is returned unchanged; a
#             directory path yields every *.json file
#             directly inside it.
#
# Entry:      $path - a single .json file, or a
#                     directory containing .json files.
#
# Exit:       Sorted list of file paths; an empty
#             list when the path does not exist or
#             the directory holds no .json files.
#
# Side effects: None.
#
# Notes:      Only top-level *.json files are matched;
#             subdirectories are not descended into.
# --------------------------------------------------
sub _collect_corpus_files {
	my ($path) = @_;

	# A plain file is taken as-is
	return ($path) if -f $path;

	# A directory: gather its top-level *.json entries, sorted
	if(-d $path) {
		my $pattern = File::Spec->catfile($path, '*.json');
		my @matches = glob($pattern);
		return sort @matches;
	}

	# Nonexistent path: nothing to process
	return ();
}

# --------------------------------------------------
# _generate_replay_tap
#
# Purpose:    Read one or more corpus JSON files and
#             produce a complete .t file as a string.
#             Each bug entry in the corpus becomes
#             one lives_ok test that calls the target
#             method with the exact input that
#             previously caused a crash, asserting
#             that it no longer dies.
#
# Entry:      @corpus_files - list of paths to corpus
#                             JSON files as returned
#                             by _collect_corpus_files.
#
# Exit:       Returns the complete .t file content as
#             a string. Never returns undef.
#             Returns a skip_all plan if no bugs are
#             found across all corpus files.
#
# Side effects: Reads corpus JSON files from disk.
#               Attempts to load JSON::MaybeXS,
#               JSON::PP, or JSON via block eval.
#
# Notes:      Corpus files that cannot be parsed, or
#             whose top level is not a JSON object,
#             are skipped with a warning rather than
#             aborting the entire run.
#             Clean corpus entries (those without
#             recorded bugs) are silently ignored —
#             only entries with a 'bugs' array are
#             processed.
#             The module name for each test is
#             inferred from the YAML schema file
#             alongside the corpus file via
#             _infer_module_from_schema. Falls back
#             to 'UNKNOWN::Module' if not found.
# --------------------------------------------------
sub _generate_replay_tap {
	my (@corpus_files) = @_;

	# Prefer JSON::MaybeXS for correctness; JSON::PP (core) and JSON are
	# fallbacks. The module name must be translated to a relative .pm path
	# by hand: 'require $mod' with a runtime string treats the string as a
	# literal filename, so requiring "JSON::MaybeXS" directly would always
	# fail regardless of what is installed.
	my $json_module;
	for my $mod (qw(JSON::MaybeXS JSON::PP JSON)) {
		(my $mod_path = "$mod.pm") =~ s{::}{/}g;
		if(eval { require $mod_path; 1 }) {
			$json_module = $mod;
			last;
		}
	}
	die "No JSON module available; install JSON or JSON::MaybeXS\n"
		unless $json_module;

	# Collect all bugs across all corpus files into a flat list
	my @tests;

	for my $file (@corpus_files) {
		open(my $fh, '<', $file)
			or die "Cannot read $file: $!";
		my $data = eval {
			$json_module->new->decode(do { local $/; <$fh> })
		};
		# Capture the eval error before close() gets a chance to clobber $@
		my $parse_error = $@;
		close $fh;

		if($parse_error || !defined $data) {
			warn "Skipping $file: could not parse JSON: $parse_error\n";
			next;
		}

		# Guard against corpus files whose top level is not a JSON object
		unless(ref($data) eq 'HASH') {
			warn "Skipping $file: top-level JSON value is not an object\n";
			next;
		}

		my $bugs = $data->{'bugs'} // [];
		next unless @{$bugs};

		# Derive method name from filename: translate.json -> translate
		my (undef, undef, $fname) = File::Spec->splitpath($file);
		(my $method = $fname) =~ s/\.json$//;

		# Look up the module name from the companion schema file;
		# fall back to a placeholder if the schema cannot be found
		my $module = _infer_module_from_schema($file, $method)
			// 'UNKNOWN::Module';

		for my $bug (@{$bugs}) {
			push @tests, {
				module => $module,
				method => $method,
				input  => $bug->{'input'},
				error  => $bug->{'error'},
				file   => $file,
			};
		}
	}

	# Build the .t header — include Test::Exception up front since
	# lives_ok is always needed when there are tests to emit
	my $t = <<'HEADER';
#!/usr/bin/env perl
# Auto-generated by fuzz-harness-generator --replay-corpus
# DO NOT EDIT - regenerate from corpus files instead
use strict;
use warnings;
use Test::More;
use Test::Exception;
HEADER

	my $test_count = scalar @tests;

	if($test_count == 0) {
		$t .= "\nplan skip_all => 'No bugs recorded in corpus files';\n";
		return $t;
	}

	# Emit one use statement per unique module (excluding the placeholder)
	my %modules = map { $_->{'module'} => 1 } @tests;
	for my $mod (sort keys %modules) {
		next if $mod eq 'UNKNOWN::Module';
		$t .= "use $mod;\n";
	}

	$t .= "\nplan tests => $test_count;\n\n";

	for my $i (0 .. $#tests) {
		my $test  = $tests[$i];
		my $n     = $i + 1;
		my $input = _format_input($test->{'input'});
		my $label = "$test->{'method'} does not die on input from $test->{'file'}";

		# Flatten and escape the original error for use as a comment
		(my $orig_error = $test->{'error'} // '') =~ s/\n/ /g;
		$orig_error =~ s/'/\\'/g;

		$t .= "# Corpus bug: $orig_error\n";
		$t .= "lives_ok { $test->{'module'}\->$test->{'method'}($input) }\n";
		$t .= "    '$label';\n\n";
	}

	return $t;
}

# --------------------------------------------------
# _format_input
#
# Purpose:    Render a scalar input value as a Perl
#             source literal, ready to be embedded in
#             the generated test file.
#
# Entry:      $value - the value to render; may be
#                      undef, a numeric string, or
#                      any other string.
#
# Exit:       Returns one of:
#               'undef'        for an undef input
#               a bare number  for numeric-looking input
#               a single-quoted string otherwise,
#               with backslashes and single quotes
#               escaped.
#
# Side effects: None.
#
# Notes:      Only scalars are supported. Arrayref or
#             hashref corpus inputs end up quoted as
#             their stringified reference, which does
#             not reproduce the original structure.
# --------------------------------------------------
sub _format_input {
	my ($value) = @_;

	# undef becomes the literal token 'undef'
	unless(defined $value) {
		return 'undef';
	}

	# Numeric-looking values are emitted unquoted
	if($value =~ /^-?(?:\d+\.?\d*|\.\d+)$/) {
		return $value;
	}

	# Everything else is single-quoted; backslashes must be doubled
	# before quotes are escaped so neither gets escaped twice
	my $quoted = $value;
	$quoted =~ s/\\/\\\\/g;
	$quoted =~ s/'/\\'/g;

	return q{'} . $quoted . q{'};
}

# --------------------------------------------------
# _infer_module_from_schema
#
# Purpose:    Determine the Perl module name for a
#             corpus method by reading the companion
#             YAML schema file that lives one level
#             above the corpus directory.
#
# Entry:      $corpus_file - path to the corpus JSON
#                            file, e.g.
#                            schemas/corpus/translate.json
#             $method      - method name derived from
#                            the corpus filename,
#                            e.g. 'translate'
#
# Exit:       The module name string if found, or
#             undef when no companion schema exists
#             or none of them contains a 'module:'
#             line.
#
# Side effects: Reads schema files from disk.
#
# Notes:      The expected layout places the corpus
#             one directory below the schemas, e.g.
#             schemas/corpus/ next to
#             schemas/translate.yaml, so this walks
#             up one level from the corpus directory.
#             The .yaml extension is tried before
#             .yml.
# --------------------------------------------------
sub _infer_module_from_schema {
	my ($corpus_file, $method) = @_;

	my $corpus_dir = (File::Spec->splitpath($corpus_file))[1];

	# The schema directory is the parent of the corpus directory
	my $schema_dir = File::Spec->catdir($corpus_dir, File::Spec->updir());

	for my $extension ('yaml', 'yml') {
		my $candidate = File::Spec->catfile($schema_dir, "$method.$extension");
		next unless -f $candidate;

		open(my $in, '<', $candidate) or next;
		my $module;
		while(my $line = <$in>) {
			if($line =~ /^module:\s*(\S+)/) {
				$module = $1;
				last;
			}
		}
		close $in;

		return $module if defined $module;
	}

	return undef;
}

__END__

=head1 REPLAY CORPUS WORKFLOW

=head2 Step 1: discover bugs with extract-schemas

    extract-schemas --fuzz lib/MyModule.pm

This runs coverage-guided fuzzing and writes any discovered bugs to
C<schemas/corpus/method.json>.

=head2 Step 2: generate a regression test file

    fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t

This reads every C<*.json> file in C<schemas/corpus/>, extracts the recorded
bugs, and generates a C<t/fuzz_replay.t> that calls each buggy input and
expects it B<not> to die.

=head2 Step 3: run the regression tests

    prove -l t/fuzz_replay.t

Tests will be red for each unfixed bug. Fix the underlying code, re-run
C<prove>, and the tests go green.

=head2 Step 4: commit both files

Commit C<schemas/corpus/> and C<t/fuzz_replay.t> to version control. The
corpus ensures future fuzz runs build on past discoveries; the replay test
ensures fixed bugs stay fixed.

=head2 Keeping the replay file up to date

Re-run C<fuzz-harness-generator --replay-corpus> whenever new bugs are
discovered or old ones are fixed and removed from the corpus. The generated
file should not be edited by hand.

=head1 SEE ALSO

L<App::Test::Generator>, L<App::Test::Generator::CoverageGuidedFuzzer>,
L<extract-schemas>

=head1 AUTHOR

Nigel Horne

=cut
