#!/usr/bin/env perl 
# ABSTRACT: Get sequences by name (also using lists)
# PODNAME: fu-extract
use 5.012;
use warnings;
use Getopt::Long;
use FindBin qw($RealBin);
use FASTX::Reader;

use File::Basename;
use Data::Dumper;
# Name of this script without its directory part; interpolated in the usage text.
my $BASENAME = basename($0);
use Proch::Seqfu;
# Version reported in the usage text: taken from Proch::Seqfu, or the
# placeholder "<Dev>" when that package variable is undefined.
my $VERSION = $Proch::Seqfu::VERSION // "<Dev>";

# --- Command line options -------------------------------------------------
# Filters: each one stays undefined (and is therefore not applied) unless
# the corresponding switch is given.
my $opt_list;       # -l: file with the sequence names to keep
my $opt_pattern;    # -p: pattern searched in the sequence names
my $opt_maxlen;     # -x: maximum sequence length
my $opt_minlen;     # -m: minimum sequence length

# Diagnostics.
my $opt_verbose;
my $opt_debug;

# How the list file is parsed.
my $opt_separator   = "\t";
my $opt_header_char = '#';
my $opt_column      = 1;

# Names loaded from the list file (name => 1).
my %seq_names = ();

my $opt_case_insensitive = 0;

# $_opt holds the GetOptions return value.
my $_opt = GetOptions(
	# sequence filters
	'p|pattern=s'        => \$opt_pattern,
	'm|minlen=i'         => \$opt_minlen,
	'x|maxlen=i'         => \$opt_maxlen,
	'i|case-insensitive' => \$opt_case_insensitive,
	# list file handling
	'l|list=s'           => \$opt_list,
	'c|column=i'         => \$opt_column,
	's|separator=s'      => \$opt_separator,
	'h|header=s'         => \$opt_header_char,
	# diagnostics
	'v|verbose'          => \$opt_verbose,
	'd|debug'            => \$opt_debug,
);

# Load the names of the sequences to keep from the list file (-l).
# One name is read per line, from column $opt_column of the line split on
# $opt_separator; lines starting with $opt_header_char are skipped.
# Names are stored as keys of %seq_names (lowercased with -i).
if (defined $opt_list) {
	# Column 0 or a negative column would silently index from the end of the row.
	die " FATAL ERROR:\n Column number must be 1 or greater (got $opt_column).\n" if ($opt_column < 1);

	open(my $list_fh, '<', $opt_list) or die " FATAL ERROR:\n Unable to read list file <$opt_list>: $!\n";

	# A single character is always taken literally, so separators such as
	# "|" or "." behave as expected. Longer values keep their regex meaning
	# (e.g. the two characters '\t' typed on the command line still mean TAB).
	my $separator_re = length($opt_separator) == 1
		? qr/\Q$opt_separator\E/
		: qr/$opt_separator/;

	# An empty header string disables header skipping instead of matching
	# (and discarding) every line.
	my $header_re = length($opt_header_char) == 0 ? undef
	              : length($opt_header_char) == 1 ? qr/^\Q$opt_header_char\E/
	              :                                 qr/^$opt_header_char/;

	while (defined(my $line = readline($list_fh))) {
		chomp($line);
		$line =~ s/\r\z//;    # tolerate CRLF line endings
		next if (defined $header_re and $line =~ $header_re);
		my @fields = split $separator_re, $line;
		my $name = $fields[$opt_column - 1];
		# Blank lines and rows with too few columns carry no name.
		next unless (defined $name and length $name);
		$name = lc($name) if ($opt_case_insensitive);
		$seq_names{ $name } = 1;
	}
	close($list_fh);
	say STDERR Dumper \%seq_names if $opt_debug;
}

# Print the help and stop when no input file was given.
usage() unless (defined $ARGV[0]);

# Compile the name pattern once, outside the read loop. With -i the match
# itself ignores case, so the pattern works however it was typed (matching a
# mixed-case pattern against a lowercased name could never succeed).
my $pattern_re;
if (defined $opt_pattern) {
	$pattern_re = $opt_case_insensitive ? qr/$opt_pattern/i : qr/$opt_pattern/;
}

# Filter every input file in turn, printing the sequences that pass all the
# requested filters to STDOUT as ">name[ comment]" followed by the sequence.
for my $file (@ARGV) {
	unless (-e $file) {
		# Report the skipped file instead of dropping it silently.
		warn " WARNING: Skipping <$file>: file not found.\n";
		next;
	}
	vprint(" - Processing $file");
	my $Fasta = FASTX::Reader->new( { filename => "$file"});
	# The loop ends when getRead() returns a false value.
	while (my $seq = $Fasta->getRead() ) {
		my $seq_length = length( $seq->{seq} );
		# Lookup key for the list: lowercased with -i, as the list names are.
		my $seq_name = $opt_case_insensitive ? lc($seq->{name}) : $seq->{name};
		next if (defined $opt_maxlen and $seq_length > $opt_maxlen);
		next if (defined $opt_minlen and $seq_length < $opt_minlen);
		next if (defined $pattern_re and $seq->{name} !~ $pattern_re);
		next if (defined $opt_list and not $seq_names{ $seq_name });
		# Keep any non-empty comment, including one that is just "0".
		my $comment = (defined $seq->{comment} and length $seq->{comment})
			? " $seq->{comment}"
			: '';

		say '>', $seq->{name}, $comment, "\n", $seq->{seq};
	}
}

# Print the command line help to STDERR and terminate the program with
# exit status 0. Takes no arguments and never returns.
# NOTE(review): the text says "-p" and "-l" are exclusive, but the option
# handling does not enforce it; confirm the intended behaviour.
sub usage {
	say STDERR<<END;
	Filter DNA sequences by pattern and/or length. Version $VERSION

	Usage:
	$BASENAME [options] InputFile.fa [...]

	-p, --pattern   STRING (searched in sequence names)
	-m, --minlen    INT
	-x, --maxlen    INT
	-l, --list      FILE
	-c, --column    INT (default: 1)
	-s, --separator CHAR (default: "\\t")
	-h, --header    CHAR (default: "#")
	-i, --case-insensitive
	-v, --verbose
	-d, --debug

	Note that "-p" and "-l" are exclusive

	example:
	$BASENAME -p 'BamHI' test.fa

	$BASENAME -l list.txt test.Fasta
END
	exit 0;
}

# Print a progress message when --verbose is set.
# The message goes to STDERR: STDOUT carries the extracted sequences, and
# writing diagnostics there would corrupt that output.
#   $_[0]  the message to print (a newline is appended)
sub vprint {
	my ($message) = @_;
	say STDERR $message if ($opt_verbose);
	return;
}

__END__

=pod

=encoding UTF-8

=head1 NAME

fu-extract - Get sequences by name (also using lists)

=head1 VERSION

version 1.5.0

=head1 PARAMETERS

=over 4

=item C<-p, --pattern> PATTERN

Print only sequences containing the given pattern in their name

=item C<-l, --list> FILE

Print only sequences in the given list file (full name must match)

=item C<-c, --column> COLUMN

In the list file, consider the name as the column COLUMN (default: 1)

=item C<-h, --header> CHAR

Ignore lines starting with CHAR in the list (default: "#")

=item C<-s, --separator> CHAR

Split the lines in the list file by CHAR (default: "\t")

=item C<-i, --case-insensitive>

Ignore case in the name (works both with C<-p> and C<-l>)

=item C<-m, --minlen> MINLEN

Print only sequences with a length of at least MINLEN

=item C<-x, --maxlen> MAXLEN

Print only sequences with a length of at most MAXLEN

=item C<-v, --verbose>

Print more information

=back

=head1 EXAMPLES

Search by sequence name:

  fu-extract -p 'BamHI' test.fa

Use a list to extract sequences:

  fu-extract -l list.txt test.Fasta

=head1 MODERN ALTERNATIVE

This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.

SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda C<conda install -c bioconda seqfu>

=head1 CITING

Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>

=cut

=head1 AUTHOR

Andrea Telatin <andrea@telatin.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2018-2022 by Andrea Telatin.

This is free software, licensed under:

  The MIT (X11) License

=cut
