#===========================================================================

package Sitescooper::StoryURLProcessor;

use Sitescooper::URLProcessor;
use Carp;

use strict;

use vars qw(
		@ISA 
	);

@ISA = qw(Sitescooper::URLProcessor);

# Constructor.
#
# Arguments (after the class or an existing instance):
#   $scoop, $robot, $scf, $ref, $url - handed unchanged to the parent
#                                      class constructor
#   $is_dynamic_html                 - stored in {is_dynamic_html}
#   $upindex                         - stored in {upindex}
#
# Also initialises {origpage} to undef and {output_storyurl_dbg} to an
# empty hash.  The object is not re-blessed here; presumably SUPER::new
# already blesses it into $class -- confirm against the parent class.
sub new {
  my ($proto, @ctor_args) = @_;
  my $class = ref($proto) || $proto;

  my ($scoop, $robot, $scf, $ref, $url, $is_dynamic_html, $upindex)
      = @ctor_args;

  my $self = $class->SUPER::new ($scoop, $robot, $scf, $ref, $url);

  # story-specific state on top of whatever the parent set up
  $self->{is_dynamic_html}     = $is_dynamic_html;
  $self->{upindex}             = $upindex;
  $self->{origpage}            = undef;
  $self->{output_storyurl_dbg} = { };

  return $self;
}

# Decide whether the story URL held in {url} should be fetched and, if so,
# request it through get_page().
#
# Croaks unless the processor is still in STATE_PRE_GET.
#
# The URL is dropped (bare return) when any of these hold, checked in
# this order:
#   - it does not match the 'story_limit_to' pattern (or the site default)
#   - it uses one of the listed non-fetchable protocols
#   - apply_url_preproc() returns undef for it
#   - the robot has already tried to download it
#   - it matches the 'story_skip' pattern
#   - it is not dynamic, {cf}{refresh} is off, and the cache already has a
#     modification time for it
#   - the robot has hit its file size limit
#   - get_page() returns false
#
# On success the effective dynamic flag is written back to
# {is_dynamic_html} and 1 is returned.
sub start_get {
  my $self = shift;

  my $fullurl = $self->{url};
  my $dynamic = $self->{is_dynamic_html};

  if ($self->get_state() != $Sitescooper::URLProcessor::STATE_PRE_GET) {
    croak ("state != STATE_PRE_GET");
  }

  my $scf   = $self->{scf};
  my $scoop = $self->{scoop};

  # all matching below works on the URL with its #anchor removed;
  # $fullurl is only kept for log messages
  my $url = Sitescooper::Util::URLWithoutAnchor ($fullurl);
  study $url;

  # an explicit per-site cacheable setting overrides the caller's guess...
  my $cacheable = $scf->get_story_param ('story_cacheable', $url);
  $dynamic = ($cacheable==0) if (defined $cacheable);

  # ...and diffed sites are always treated as dynamic
  $dynamic = 1 if ($scf->{story_diff});

  my $limit_pat = $scf->get_story_param ('story_limit_to', $url);
  $limit_pat = $scf->{def_story_limit_to} unless (defined $limit_pat);

  if (defined ($limit_pat) && !$self->match_url ($url, $limit_pat)) {
    # log the pattern itself only once per URL
    unless (defined $self->{output_storyurl_dbg}{$self->{url}}) {
      $scoop->dbg ("StoryURL for $self->{url}: $limit_pat");
      $self->{output_storyurl_dbg}{$self->{url}} = 1;
    }
    $scoop->dbg ("Non-story URL ignored: $fullurl");
    return;
  }

  if ($url =~ m,^(ftp|mailto|https|gopher|pnm)://,) {
    $scoop->dbg ("Non-story URL ignored (bad protocol): $fullurl");
    return;
  }

  # site-supplied URL rewriting; undef means "ignore this URL"
  my $rewritten = $self->apply_url_preproc ($url);
  unless (defined $rewritten) {
    $scoop->dbg ("URLProcess says URL should be ignored: $fullurl");
    return;
  }
  if ($rewritten ne $url) {
    $fullurl = $rewritten;
    $url = Sitescooper::Util::URLWithoutAnchor ($rewritten);
    $self->{url} = $url;
  }

  # never attempt the same URL twice in one run
  if (defined $self->{robot}->{already_tried_download}->{$url}) {
    return;
  }
  $self->{robot}->{already_tried_download}->{$url} = 1;

  # NOTE(review): the skip pattern is anchored as ^PAT$ without grouping,
  # so a top-level alternation in PAT is only partly anchored -- confirm
  # that site files account for this.
  my $skip_pat = $scf->get_story_param ('story_skip', $url);
  if (defined ($skip_pat) && $url =~ m#^${skip_pat}$#) {
    $scoop->verbose ("Skipping: $fullurl");
    return;
  }

  my $lastmod = $self->{cache}->get_last_modtime ($url);

  unless ($dynamic || $self->{cf}->{refresh} || !defined ($lastmod)) {
    $scoop->dbg ("skipping, already seen: $fullurl");
    return;
  }

  return if ($self->{robot}->{hit_file_size_limit});

  $scoop->verbose ("Reading: $url");
  if (!$self->get_page ($url, $dynamic)) {
    return;
  }

  $self->{is_dynamic_html} = $dynamic;
  return 1;
}

# ---------------------------------------------------------------------------

# Process the reply for this story URL once the fetch has completed.
#
# Steps, in order:
#   1. move to STATE_POST_GET and fetch the reply via get_url_reply()
#   2. if an HTTP retry is needed, re-request through a fresh
#      StoryURLProcessor and stop
#   3. enforce {cf}{linkslimit}
#   4. resolve redirects / base URL, extract the headline, strip the page
#      down to the story and run the story preprocs
#   5. cache the original page (immediately, or at commit for diff sites)
#   6. convert to the configured output style
#   7. drop the story if it is filtered by lifetime, unchanged text,
#      profile matching or lack of any text
#   8. hand the story to the robot for writing and follow any
#      "turn-over" (next page) links found in it
#
# Every early exit is a bare return; no explicit value is returned on
# success.  The sub bails out at several points when
# $Sitescooper::Main::got_intr_flag is set (presumably a user interrupt --
# confirm against Sitescooper::Main).
sub finish_get {
  my $self = shift;

  $self->set_state ($Sitescooper::URLProcessor::STATE_POST_GET);
  if ($Sitescooper::Main::got_intr_flag) { return; }

  my $url = $self->{url};
  my $is_dynamic_html = $self->{is_dynamic_html};
  my $upindex = $self->{upindex};

  my $origpage = $self->get_url_reply();

  # The request has to be repeated against a different URL: hand over to a
  # new processor carrying the same dynamic flag and up-index.
  # NOTE(review): uses indirect-object syntax ("new Class (...)");
  # Class->new(...) is the unambiguous spelling.
  if ($self->does_need_http_retry()) {
    my $handler = new Sitescooper::StoryURLProcessor ($self->{scoop},
        $self->{robot}, $self->{scf}, $self->{url},
	$self->get_http_retry_url(), $is_dynamic_html, $upindex);
    $self->{robot}->re_request_page ($handler);
    return;
  }

  # no reply body: nothing to do
  return unless defined $origpage;

  if ($Sitescooper::Main::got_intr_flag) { return; }
  # Once the per-site story limit is reached, set the robot's
  # hit_file_size_limit flag; start_get() and get_story_turnover_links()
  # both check that flag and stop.
  if ($self->{cf}->{linkslimit} > 0 && $self->{robot}->{stories_found} >= $self->{cf}->{linkslimit}) {
    $self->{scoop}->verbose ("over links limit, stopping this site.");
    $self->{robot}->{hit_file_size_limit} = 1;
    return;
  }

  # $url may be replaced here; $newbase is later used as the base for
  # resolving turn-over links.
  my ($redir_from, $newbase);
  ($redir_from, $url, $newbase) = $self->handle_redirects_and_base_url
  					($url, $origpage);

  # get headline before stripping StoryStart and StoryEnd
  my $headline;
  $headline = $self->get_headline ($url, $origpage);

  # the robot still wants a site title: take it from this page's <title>
  if ($self->{robot}->{need_title} && $origpage =~ /<title>\s*(\S.*?)\s*<\/title>/is) {
    $self->{robot}->change_title ($1);
  }

  $self->{scoop}->journal ("url", $url);
  $self->{scoop}->journal ("base_url", $newbase);
  $self->{scoop}->journal ("pre_strip_story", $origpage);
  my $page = $self->strip_story ($url, $origpage);
  $self->{scoop}->journal ("post_strip_story", $page);

  $page = $self->apply_story_preprocs ($url, $page);

  # Look up the previously cached copy of this page:
  #   - diff sites: the cached HTML, to diff the new page against
  #   - dynamic pages: the cache object, for the unchanged-text check below
  my $cobj = undef;
  my $cachediffhtml = undef;
  my $isdiff = (defined $self->{scf}->{story_diff} && $self->{scf}->{story_diff});

  if ($isdiff) {
    my $diffobj = $self->{cache}->get_cached_page_for_diff ($url);
    if (defined $diffobj) {
      $cachediffhtml = $diffobj->get_page();
    }

  } elsif ($is_dynamic_html) { 
    $cobj = $self->{cache}->get_cached_page ($url);
  }

  # run the cached HTML through the same strip + preproc steps as the new
  # page so that the two are comparable
  if (defined $cachediffhtml) {
    $cachediffhtml = $self->strip_story_silently ($url, $cachediffhtml);
    $cachediffhtml = $self->apply_story_preprocs ($url, $cachediffhtml);
  }

  # Note that it is the original, unstripped page that gets cached.
  # get_new_bits() presumably returns only the parts of $page that are not
  # in the cached copy -- confirm against its definition.
  if ($isdiff) {
    $page = $self->get_new_bits ($cachediffhtml, $page);
    $self->{cache}->cache_page_at_commit ($url, $redir_from, $origpage);
  } else {
    $self->{cache}->cache_page_now ($url, $redir_from, $origpage);
  }

  if ($Sitescooper::Main::got_intr_flag) { return; }

  # only do the extra text conversion if the journal filehandle is open
  if (defined fileno Sitescooper::Main::JOURNAL) {
    # always write a text-mode version for the journal
    $self->{scoop}->journal ("to_text_story",
    	$self->html_to_text ($url, $page, $Sitescooper::Main::OUT_TEXT));
  }

  # get turn-over links after stripping StoryStart and StoryEnd,
  # but before converting to text/whatever.

  my @turnoverlinks;
  @turnoverlinks = $self->get_story_turnover_links ($newbase, $page);
  $page = $self->html_to_text_warn_about_ext_links
			 ($url, $page, $self->{robot}->{cf}->{output_style});

  # NOTE(review): this compares the cached modification time directly
  # against the lifetime converted from days to seconds, not against an
  # age relative to the current time, and $life is used unchecked even if
  # 'story_lifetime' is unset.  It also runs after the page was cached
  # above.  Confirm what get_last_modtime() returns and whether this test
  # is as intended.
  my $life = $self->{scf}->get_story_param ('story_lifetime', $url);
  my $mod = $self->{cache}->get_last_modtime ($url);
  if (defined $mod && $mod < $life * 24 * 60 * 60) {
    $self->{scoop}->verbose ("Skipping (story is older than ".$life." days): $url");
    return;
  }

  if ($is_dynamic_html && defined $cobj && !$self->{cf}->{refresh}
  	&& !$self->{scf}->{image_only_site})
  {
    # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
    # HTML. Sometimes the ad banners will be the only things that have
    # changed between retrieves, and html_to_text will have stripped those
    # out.
    if ($cobj->text_matches_cached_text ($url, $self, $page)) {
      $self->{scoop}->verbose ("Skipping (text has not changed): $url");
      return;
    }
  }

  # "grep" mode: keep the story only if it matches a profile, unless the
  # site sets 'ignore_profiles' to a non-zero value
  my $grepmode = $self->{scf}->get_story_param ('grep', $url);
  if ($grepmode) {
    my $ignprof = $self->{scf}->get_story_param ('ignore_profiles', $url);
    if (!defined $ignprof || $ignprof == 0) {
      my $nhscore;
      if (($nhscore = $self->MatchNewsHoundProfiles
				  ($url, $headline, $page)) eq '')
      {
	$self->{scoop}->verbose ("Skipping (page does not match profiles): $url");
	return;
      }

      #$page = $nhscore.$page;		# TODO??
    }
  }

  # ensure there's some alphanumerics in the output text. No alnums means
  # no output. HTML needs to be checked to ensure we don't just pick
  # up tags which will not be displayed. Added kld's check for image-only
  # sites.
  {
    my $gottext = 1;

    if ($self->{scf}->{image_only_site} == 1) {
      $self->{scoop}->dbg ("image-only site, not checking if text is present");

    } elsif ($self->writing_html) {
      # HTML output: require a text character directly before a tag,
      # directly after a tag, or at the very start of the page
      if ($page !~ /[A-Za-z0-9"']\s*</
		&& $page !~ />\s*[A-Za-z0-9"']/
		&& $page !~ /^\s*[A-Za-z0-9"']/)
      { $gottext = 0; }

    } else {
      if ($page !~ /[A-Za-z0-9"']/) { $gottext = 0; }
    }

    if ($gottext == 0) {
      $self->{scoop}->verbose ("Skipping (no text to write): $url");
      return;
    }
  }

  # the story passed every filter: record the link and write it out
  $self->{robot}->add_snarfed_link ($url);
  if ($self->{scf}->{levels} < 0) {
    # this is a one-level site: therefore the story should be treated
    # as the "front page". Thx Carsten for this one.
    $self->{robot}->write_as_story (1, $url, $page, $headline, $upindex);
  } else {
    $self->{robot}->write_as_story (0, $url, $page, $headline, $upindex);
  }

  # follow the "next page" links collected earlier, attributing them to
  # this story's URL and keeping the same up-index
  $self->{scoop}->journal ("turnover_links_story", join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;

    @turnoverlinks = $self->{robot}->absolutify_and_grep_unseen_urls
					 ($newbase, @turnoverlinks);
    for $link (@turnoverlinks)
    {
      if ($Sitescooper::Main::got_intr_flag) { return; }
      # NOTE(review): the links were already passed through
      # absolutify_and_grep_unseen_urls() above; this second
      # absolutisation is presumably a no-op -- confirm.
      $link = Sitescooper::Util::AbsoluteURL ($newbase, $link);
      $self->{robot}->download_story_page ($url, $link, 0, $upindex);	# right now
    }
  }
}

# ---------------------------------------------------------------------------

# Apply the site's story-level HTML tweaks to $page and return the result.
#
# Parameters:
#   $url  - story URL, used to look up the per-story parameters
#   $page - story HTML
#
# In order:
#   - prepend the 'story_html_header' parameter, if defined
#   - append the 'story_html_footer' parameter, if defined
#   - run the 'story_preproc' code, if defined, with the page in $_
#
# If the preproc code dies or evaluates to false, a site warning is
# issued and the page as it stood before the preproc (header and footer
# already applied) is returned.
sub apply_story_preprocs {
  my ($self, $url, $page) = @_;
  # keep the assignment to $_ below from leaking out to the caller
  local ($_);

  my $bits = $self->{scf}->get_story_param ('story_html_header', $url);
  if (defined $bits) { $page = $bits . $page; }
  $bits = $self->{scf}->get_story_param ('story_html_footer', $url);
  if (defined $bits) { $page .= $bits; }

  my $proc = $self->{scf}->get_story_param ('story_preproc', $url);
  if (defined $proc) {
    $_ = $page;
    # $site_level is not used by this sub itself; it is in lexical scope of
    # the string eval below, so it is presumably there for the preproc code
    # to read -- confirm against the site file documentation.  The same
    # applies to every other lexical here ($self, $url, $page, ...), so
    # renaming them could break existing preproc snippets.
    my $site_level = 1;
    # SECURITY NOTE(review): this is a string eval of text taken from the
    # site configuration.  It runs arbitrary Perl with this process's
    # privileges, so site files must come from a trusted source.
    # The appended "; 1;" makes a successful run evaluate to true whatever
    # the snippet's own last value is.
    if (!eval $proc."; 1;") {
      $self->sitewarn ("StoryHTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
      $self->{scoop}->journal ("post_story_preproc", $page);
    }
  }
  return $page;
}

# ---------------------------------------------------------------------------

# Score a story against each profile in {scoop}{profiles}, in list order.
#
# Parameters:
#   $url      - story URL (not used by the scoring)
#   $headline - story headline (only referenced by the commented-out
#               debug output below)
#   $bodyText - story text passed to Sitescooper::NewsHound::ScoreStory
#
# Returns "Scored <score> on profile <name>\n" for the first profile whose
# score is at least that profile's {score} value, or the empty string if
# no profile matches.
sub MatchNewsHoundProfiles {
  my ($self, $url, $headline, $bodyText) = @_;

  $headline ||= '';

  # Iterate with a lexical variable rather than the global $_: foreach
  # aliases its loop variable to the array element, so a callee that
  # assigned to $_ would silently overwrite the profile entry itself and
  # break the {score}/{name} lookups that follow.
  foreach my $profile (@{$self->{scoop}->{profiles}}) {
    #print "Checking story $headline against profile $profile->{name}: ";
    my $score = Sitescooper::NewsHound::ScoreStory($profile, $bodyText);

    #print "Score $score, needed $profile->{score}\n";
    if ($score >= $profile->{score}) {
      return "Scored $score on profile $profile->{name}\n";
    }
  }

  # if we don't match any profile return null string
  return '';
}

# ---------------------------------------------------------------------------

# Reduce $page to its story section by delegating to strip_html() with the
# "Story" prefix and 1 as the fourth argument.  Judging by the sibling
# strip_story_silently(), that argument presumably enables warnings --
# confirm against strip_html().
sub strip_story {
  my $self = shift;
  my ($story_url, $html) = @_;
  return $self->strip_html ($story_url, "Story", undef, 1, $html);
}

# Same as strip_story(), except that 0 is passed as the fourth argument to
# strip_html() (presumably suppressing its warnings -- confirm against
# strip_html()).
sub strip_story_silently {
  my $self = shift;
  my ($story_url, $html) = @_;
  return $self->strip_html ($story_url, "Story", undef, 0, $html);
}

# ---------------------------------------------------------------------------

# Work out the headline for a story page.
#
# Parameters:
#   $url  - story URL
#   $page - raw HTML of the story page
#
# Sources, tried in this order:
#   1. a title recorded for this URL in {robot}{url_title}
#   2. the site's 'head_pat' pattern, whose first capture is the headline;
#      when a pattern is configured no further fallback is attempted, and
#      a site warning is issued if it does not match or captures nothing
#   3. a PointCast <meta name="PCTITLE"> tag
#   4. the page's <TITLE> tag
#
# The chosen text is passed through html_to_text() in OUT_TEXT mode.
# Returns the headline, or undef if none could be determined.
sub get_headline {
  my ($self, $url, $page) = @_;

  my $text_mode = $Sitescooper::Main::OUT_TEXT;

  # 1. title already known for this URL
  my $known_title = $self->{robot}->{url_title}->{$url};
  if (defined $known_title) {
    my $from_rdf = $self->html_to_text ($url, $known_title, $text_mode);
    $self->{scoop}->dbg ("StoryHeadline: (from RDF): $from_rdf");
    return $from_rdf;
  }

  my $headline;

  # 2. site-configured headline pattern
  my $pat = $self->{scf}->get_story_param ('head_pat', $url);
  if (defined $pat) {
    if ($page =~ m#${pat}#m) {
      if (defined $1) {
	$headline = $self->html_to_text ($url, $1, $text_mode);
	# not logged here: the headline is logged later on anyway
      } else {
	$self->sitewarn ("StoryHeadline pattern \"$pat\" contains no brackets!\n");
      }
    } else {
      $self->sitewarn ("StoryHeadline pattern \"$pat\" not found in page $url\n");
    }
    return $headline;
  }

  # 3. and 4. generic fallbacks.
  # NOTE(review): neither pattern uses /s, so a title whose text spans
  # more than one line will not match here.
  if ($page =~ m#<meta name="PCTITLE" content="\s*(.*?)\s*">#mi) {
    $headline = $self->html_to_text ($url, $1, $text_mode);
    $self->{scoop}->dbg ("StoryHeadline (default, PointCast): $headline");
  }
  elsif ($page =~ m#<TITLE>\s*(.*?)\s*</TITLE>#mi) {
    $headline = $self->html_to_text ($url, $1, $text_mode);
    $self->{scoop}->dbg ("StoryHeadline (default, TITLE tag): $headline");
  }

  return $headline;
}

# ---------------------------------------------------------------------------

# Collect the links in a story page that should be followed as further
# pages of the same story.
#
# Parameters:
#   $url  - URL the page is considered to live at (the caller passes the
#           page's base URL); used for parameter lookup and to derive the
#           site's domain
#   $page - story HTML (a private copy; links are removed from it as they
#           are processed)
#
# Behaviour depends on the 'story_follow_links' parameter:
#   true    - every link in the page is returned
#   defined but false - no links are returned
#   unset   - only links that look like "next page" links are returned:
#             the link text contains more / next / "N of M" / &gt;&gt;,
#             is shorter than 15 characters and is not a run of four or
#             more words, and the link contains the site's domain
#
# Links are extracted in three passes -- double-quoted hrefs first, then
# single-quoted, then unquoted -- so the result is ordered by quoting
# style first and by position in the page second.  Scanning stops early
# once the robot's hit_file_size_limit flag is set.
#
# Returns the list of link strings exactly as found in the page (not
# absolutised).
sub get_story_turnover_links {
  my $self = shift;
  my $url = shift;
  my $page = shift;

  my @turnoverlinks = ();
  my $followlinks = $self->{scf}->get_story_param
  				('story_follow_links', $url);

  while (1) {
    if ($self->{robot}->{hit_file_size_limit}) { last; }

    if (
      $page =~ s/<a\s+[^>]*href\s*=\s*\"([^\">]+)\"[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href\s*=\s*\'([^\'>]+)\'[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href\s*=\s*([^\s>]+)[^>]*>(.+?)<\/a>//is
      )
    {
      my $link = $1;
      my $txt = $2;

      # strip stray quoting left around the href value
      $link =~ s/^(?:\"|\'|%22)*//; $link =~ s/(?:\"|\'|%22)*$//;
      if ($followlinks) {
	push (@turnoverlinks, $link);

      } elsif (!defined $followlinks && $txt =~ m,(more|next|\d+ of \d+|&gt;&gt;),i)
      {
	# only follow More links automatically if StoryFollowLinks is unset

	# Pattern the link must match: the trailing "domain.tld/" part of
	# the page URL, taken literally.  The captured text is quoted with
	# \Q..\E so that "." only matches a dot and a stray regex
	# metacharacter in the URL cannot make the match die.  If no
	# domain can be extracted, any non-empty link is accepted.
	my $urlguts = qr/./;
	($url =~ /^http:\/\/\S+\.([^\.\/]+\.[^\.\/]+\/).*$/) and
	    ($urlguts = qr/\Q$1\E/);

	if (($txt !~ /[a-z0-9] [a-z0-9]+ [a-z0-9]+ [a-z0-9]/i) # 4 or more words
	    && (length ($txt) < 15)
	    && $link =~ $urlguts)
	{
	  push (@turnoverlinks, $link);
	  $txt =~ s/[\n\r]+/ /g;
	  $self->{scoop}->verbose ("(Following 'next page' link: \"$txt\")");
	}
      }
      next;
    }

    last;		# no more links available
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------

1;
