package Bioinfo::App::Cmd::Blast::Cmd::ParseXML;
use Modern::Perl;
use Moo;
use MooX::Cmd;
use MooX::Options prefer_commandline => 1;
use IO::All;
use XML::Twig;

our $VERSION = '0.1.14'; # VERSION:
# ABSTRACT: parse XML file of blast+(with outfmt=5) into tabular format


# Command-line option --input / -i (required, string): path of the BLAST+
# XML report that execute() hands to XML::Twig for parsing.
option 'input' => (
  doc      => 'a file of xml format generated by blast+',
  short    => 'i',
  format   => 's',
  required => 1,
  is       => 'ro',
);


# Command-line option --output / -o (string): path of the tabular result
# file. execute() also derives a second file name from it by appending ".m8".
# Note that this option is not declared as required.
option 'output' => (
  doc    => 'the outfile name of outfmt6',
  short  => 'o',
  format => 's',
  is     => 'ro',
);

# File-scoped output handles: assigned in execute() and written to by the
# _iteration() twig handler. The handler is registered as a plain code
# reference, so it has no command object to read them from.
#   $out_handle   - rows including the trailing hit-definition column
#   $out_handle_s - the same rows without that last column (".m8" file)
my ($out_handle, $out_handle_s);


# Command entry point (called by MooX::Cmd).
#
# Arguments:
#   $self      - the command object; supplies ->input and ->output
#   $args_ref  - array ref of command-line arguments; when it is empty the
#                usage text is requested via options_usage
#   $chain_ref - command chain passed in by MooX::Cmd (not used here)
#
# Streams the XML file named by --input through XML::Twig, letting
# _iteration() write one row per HSP to the --output file and to a second
# file named "<output>.m8", then prints a completion message.
#
# Dies if --output was not supplied.
sub execute {
  my ($self, $args_ref, $chain_ref) = @_;
  $self->options_usage unless (@$args_ref);

  my $input  = $self->input;
  my $output = $self->output;

  # --output is not declared as required, so it may be undefined here.
  # Without this guard the companion file would be named just ".m8" and the
  # main handle would have no file name at all.
  die "the --output option is required\n"
    unless defined $output && length $output;

  $out_handle   = io($output);
  $out_handle_s = io($output . ".m8");

  my $twig = XML::Twig->new(
    twig_handlers => {
      Iteration => \&_iteration,
    }
  );
  $twig->parsefile($input);

  # The handles live in file-scoped variables and would otherwise stay open
  # until interpreter shutdown; close them so every buffered row is on disk
  # before success is reported.
  $_->close for ($out_handle, $out_handle_s);

  say "finished to parse $input to $out_handle";
}


# XML::Twig handler for each <Iteration> element (one per query).
#
# Arguments:
#   $twig      - the XML::Twig object driving the parse (not used here)
#   $iteration - the XML::Twig element for the completed <Iteration>
#
# For every <Hsp> of every <Hit> inside the iteration, one tab-separated
# row is written with the columns:
#   query, hit id, % identity, alignment length, mismatches, gaps,
#   query start, query end, hit start, hit end, e-value, bit score
# The row printed to $out_handle carries the hit definition as an extra last
# column; the row printed to $out_handle_s omits it. The iteration is purged
# from the twig afterwards to release the parsed elements.
sub _iteration {
  my ($twig, $iteration) = @_;

  # Query identifier, cut at the first run of whitespace. This does not
  # depend on the hit or HSP, so it is done once per iteration.
  my $query_name = $iteration->first_child_text("Iteration_query-ID");
  $query_name =~s/(.+?)\s+(.+)/$1/;

  for my $hit ($iteration->descendants("Hit")) {

    # hit's info
    my $hit_name = $hit->first_child_text("Hit_id");
    my $hit_def  = $hit->first_child_text("Hit_def");

    for my $hsp ($hit->descendants("Hsp")) {

      # hsp's information
      my $score     = int($hsp->first_child_text("Hsp_bit-score"));
      my $evalue    = $hsp->first_child_text("Hsp_evalue");
      my $q_start   = $hsp->first_child_text("Hsp_query-from");
      my $q_end     = $hsp->first_child_text("Hsp_query-to");
      my $s_start   = $hsp->first_child_text("Hsp_hit-from");
      my $s_end     = $hsp->first_child_text("Hsp_hit-to");
      my $identical = $hsp->first_child_text("Hsp_identity");
      my $gaps      = $hsp->first_child_text("Hsp_gaps");
      my $len       = $hsp->first_child_text("Hsp_align-len");

      # Mismatches are the aligned columns that are neither identical
      # nor gaps.
      my $mismatches = $len - $identical - $gaps;

      # Identity as a percentage of the alignment length. Exactly one
      # argument is passed for the single %.3f conversion; a surplus
      # argument triggers a "Redundant argument in sprintf" warning on
      # every HSP.
      my $identity = sprintf("%.3f", $identical / $len * 100);

      my @cols = (
        $query_name, $hit_name, $identity, $len, $mismatches, $gaps,
        $q_start, $q_end, $s_start, $s_end, $evalue, $score,
      );

      # Full row (with hit definition) first, then the 12-column row.
      $out_handle->print(join("\t", @cols, $hit_def) . "\n");
      $out_handle_s->print(join("\t", @cols) . "\n");
    }
  }
  $iteration->purge;
}


1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Bioinfo::App::Cmd::Blast::Cmd::ParseXML - parse XML file of blast+(with outfmt=5) into tabular format

=head1 VERSION

version 0.1.14

=head1 SYNOPSIS

  use Bioinfo::App::Cmd::Blast::Cmd::ParseXML;
  Bioinfo::App::Cmd::Blast::Cmd::ParseXML->new_with_cmd;
  ...

=head1 DESCRIPTION

Parse an XML file produced by blast+ (with outfmt=5) into tabular format.
In addition, the definition of each hit sequence is appended as the last
column of every row, since it is usually important for our work.

=head1 ATTRIBUTES

=head2 input

a file of xml format generated by blast+

=head2 output

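The name of the output file in tabular format (similar to the result of blast+ with the parameter outfmt=6).
A second file with the same name plus a C<.m8> suffix is also written; it contains the same rows without the trailing definition column.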

=head1 METHODS

=head2 execute

=head2 _iteration

The handler that XML::Twig calls for each C<Iteration> element; it writes one row per HSP to the output files.

=head1 AUTHOR

Yan Xueqing <yanxueqing621@163.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2017 by Yan Xueqing.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut
