Skip site navigation (1)Skip section navigation (2)

FreeBSD Manual Pages

  
 
  

home | help
KinoSearch1::Docs::Tutorial(3)  User Contributed Perl Documentation  KinoSearch1::Docs::Tutorial(3)

NAME
       KinoSearch1::Docs::Tutorial - sample indexing and search	applications

DESCRIPTION
       The following sample code for invindexer.plx and	search.cgi can be used
       to create a simple search engine. It requires the html presentation of
       the US Constitution included in the distribution	for KinoSearch1, under
       "t/us_constitution".

       Note that a proper indexer for html documents would not rely on quick-
       n-dirty regular expressions for stripping tags, as this one does	for
       the sake	of brevity -- it would use a dedicated parsing module such as
       HTML::Parser.

   invindexer.plx
	   #!/usr/bin/perl
	   use strict;
	   use warnings;

	   use File::Spec;
	   use KinoSearch1::InvIndexer;
	   use KinoSearch1::Analysis::PolyAnalyzer;

	   ### In order	for invindexer.plx to work correctly, you must modify
	   ### $source_dir, $path_to_invindex, and possibly $base_url.
	   ###
	   ### $source_dir must	lead to	the directory containing the US
	   ### Constitution html files.
	   ###
	   ### $path_to_invindex is the	future location	of the invindex.
	   ###
	   ### $base_url should	reflect	the location of	the us_constitution directory
	   ### when accessed via a web browser.
	   # Settings that must be filled in before the script is run.
	   my $source_dir	= '';
	   my $path_to_invindex	= '';
	   my $base_url		= '/us_constitution';

	   # Collect the names of the html files found in $source_dir.
	   # The pattern is anchored with \z so that only names which END in
	   # ".html" are selected; an unanchored /\.html/ would also pick up
	   # stray files such as "art1.html.bak" or "notes.htmlx".
	   opendir( my $source_dh, $source_dir )
	       or die "Couldn't	opendir	'$source_dir': $!";
	   my @filenames = grep { /\.html\z/ } readdir $source_dh;
	   closedir $source_dh or die "Couldn't	closedir '$source_dir':	$!";

	   ### STEP 1: Choose an Analyzer.
	   my $analyzer
	       = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );

	   ### STEP 2: Create an InvIndexer object.
	   my $invindexer = KinoSearch1::InvIndexer->new(
	       create   => 1,
	       invindex => $path_to_invindex,
	       analyzer => $analyzer,
	   );

	   ### STEP 3: Define fields.
	   # Each entry holds the named arguments for one spec_field() call,
	   # in the order the fields are to be declared.
	   for my $field_spec (
	       [ name => 'title' ],
	       [ name => 'bodytext', vectorized => 1 ],
	       [ name => 'url',      indexed    => 0 ],
	       )
	   {
	       $invindexer->spec_field( @{$field_spec} );
	   }

	   # Read every html file (except the index page), pull out its title
	   # and body text, and add it to the invindex as one document.
	   foreach my $filename (@filenames) {
	       next if $filename eq 'index.html';
	       my $filepath = File::Spec->catfile( $source_dir, $filename );
	       open( my $fh, '<', $filepath )
		   or die "couldn't open file '$filepath': $!";
	       my $content = do { local $/; <$fh> };    # slurp the whole file

	       # Release the handle right away instead of holding it until the
	       # lexical goes out of scope, and report read errors seen at close.
	       close $fh or die "couldn't close file '$filepath': $!";

	       ### STEP 4: Start a new document.
	       my $doc = $invindexer->new_doc;

	       $content =~ m#<title>(.*?)</title>#s
		   or die "couldn't isolate title in '$filepath'";
	       my $title = $1;
	       $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
		   or die "couldn't isolate bodytext in	'$filepath'";
	       my $bodytext = $1;
	       $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

	       ### STEP 5: Set the value for each field.
	       $doc->set_value( url      => "$base_url/$filename" );
	       $doc->set_value( title    => $title );
	       $doc->set_value( bodytext => $bodytext );

	       ### STEP 6: Add the document to the invindex.
	       $invindexer->add_doc($doc);

	       ### STEP 7: Repeat steps 4-6 for each document in the collection.
	   }

	   ### STEP 8: Finalize the invindex.
	   $invindexer->finish;

   search.cgi
	   #!/usr/bin/perl -T
	   use strict;
	   use warnings;

	   use CGI;
	   use List::Util qw( max min );
	   use POSIX qw( ceil );
	   use KinoSearch1::Searcher;
	   use KinoSearch1::Analysis::PolyAnalyzer;
	   use KinoSearch1::Highlight::Highlighter;

	   # Read the request parameters.  Both come straight from the client,
	   # so neither may be trusted as-is.
	   my $cgi	     = CGI->new;
	   my $q	     = $cgi->param('q');
	   my $offset	     = $cgi->param('offset');
	   my $hits_per_page = 10;
	   $q = '' unless defined $q;

	   # Accept the offset only if it consists entirely of ASCII digits;
	   # anything else (missing, empty, negative, non-numeric) falls back
	   # to 0.  Taking the value from the capture also untaints it, which
	   # matters because this script runs under -T.
	   $offset = ( defined $offset && $offset =~ /\A([0-9]+)\z/ ) ? $1 : 0;

	   ### Before search.cgi can work, $path_to_invindex has to be set to
	   ### the location of the invindex built by invindexer.plx.  $base_url
	   ### may also need adjusting so that it matches where a web browser
	   ### finds the us_constitution directory.
	   my $path_to_invindex	= '';
	   my $base_url		= '/us_constitution';

	   ### STEP 1: Specify the same Analyzer used to create the invindex.
	   my $analyzer
	       = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );

	   ### STEP 2: Create a Searcher object.
	   my $searcher = KinoSearch1::Searcher->new(
	       analyzer => $analyzer,
	       invindex => $path_to_invindex,
	   );

	   ### STEP 3: Feed a query to the Searcher object.
	   my $hits = $searcher->search($q);

	   ### STEP 4: Arrange for highlighted excerpts to be created.
	   my $highlighter
	       = KinoSearch1::Highlight::Highlighter->new( excerpt_field => 'bodytext' );
	   $hits->create_excerpts( highlighter => $highlighter );

	   ### STEP 5: Process the search.
	   $hits->seek( $offset, $hits_per_page );

	   ### STEP 6: Format the results however you like.

	   # Build one html fragment per hit, then glue the fragments together
	   # to form the result list.
	   my @entries;
	   while ( my $result = $hits->fetch_hit_hashref ) {
	       my $score = sprintf( "%0.3f", $result->{score} );
	       push @entries, qq|
		   <p>
		       <a href="$result->{url}"><strong>$result->{title}</strong></a>
		       <em>$score</em>
		       <br>
		       $result->{excerpt}
		       <br>
		       <span class="excerptURL">$result->{url}</span>
		   </p>
		   |;
	   }
	   my $report = join '', @entries;

	   # From here on $q is only used for display, so escape it once.
	   $q = CGI::escapeHTML($q);

	   # Build $num_hits_info: a summary of the number of hits followed by
	   # the "Prev", numbered-page and "Next" links.
	   my $total_hits = $hits->total_hits;
	   my $num_hits_info;
	   if ( !length $q ) {
	       # no query, no display
	       $num_hits_info = '';
	   }
	   elsif ( $total_hits == 0 ) {
	       # alert the user that their search failed
	       $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
	   }
	   else {
	       # calculate the nums for the first and last hit to display
	       my $last_result  = min( ( $offset + $hits_per_page ), $total_hits );
	       my $first_result = min( ( $offset + 1 ), $last_result );

	       # display the result nums, start paging info
	       $num_hits_info = qq|
		   <p>
		       Results <strong>$first_result-$last_result</strong>
		       of <strong>$total_hits</strong> for <strong>$q</strong>.
		   </p>
		   <p>
		       Results Page:
		   |;

	       # calculate first and last hits pages to display / link to
	       my $current_page = int( $first_result / $hits_per_page ) + 1;
	       my $last_page    = ceil( $total_hits / $hits_per_page );
	       my $first_page   = max( 1, ( $current_page - 9 ) );
	       $last_page = min( $last_page, ( $current_page + 10 ) );

	       # create a url for use in paging links
	       my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
	       $href .= ";offset=0" unless $href =~ /offset=/;

	       # Matches whatever value currently follows "offset=", up to the
	       # next parameter separator.  Matching only \d+ would leave every
	       # link pointing at the stale value when the request carried an
	       # empty or non-numeric offset.
	       my $offset_value = qr/(?<=offset=)[^;&]*/;

	       # generate the "Prev" link
	       if ( $current_page > 1 ) {
		   my $new_offset = ( $current_page - 2 ) * $hits_per_page;
		   $href =~ s/$offset_value/$new_offset/;
		   $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
	       }

	       # generate paging links
	       for my $page_num ( $first_page .. $last_page ) {
		   if ( $page_num == $current_page ) {
		       $num_hits_info .= qq|$page_num \n|;
		   }
		   else {
		       my $new_offset = ( $page_num - 1 ) * $hits_per_page;
		       $href =~ s/$offset_value/$new_offset/;
		       $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
		   }
	       }

	       # generate the "Next" link -- only while there is a later page.
	       # Comparing with "<" rather than "!=" keeps the link from
	       # appearing when the requested offset lies beyond the last page.
	       if ( $current_page < $last_page ) {
		   my $new_offset = $current_page * $hits_per_page;
		   $href =~ s/$offset_value/$new_offset/;
		   $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
	       }

	       # finish paging links
	       $num_hits_info .= "</p>\n";
	   }

	   # blast it all out: first the HTTP header, then the complete page.
	   # The heredoc interpolates $base_url (stylesheet and index links),
	   # $q (page title and search box), $report and $num_hits_info.
	   # Everything up to the END_HTML terminator is sent to the browser
	   # verbatim, so no comments may be placed inside it.
	   print "Content-type:	text/html\n\n";
	   print <<END_HTML;
	   <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01	Transitional//EN"
	       "http://www.w3.org/TR/html4/loose.dtd">
	   <html>
	   <head>
	       <meta http-equiv="Content-type"
		   content="text/html;charset=ISO-8859-1">
	       <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
	       <title>KinoSearch: $q</title>
	   </head>

	   <body>

	       <div id="navigation">
		   <form id="usconSearch" action="">
		       <strong>
		       Search the <a href="$base_url/index.html">US Constitution</a>:
		       </strong>
		       <input type="text" name="q" id="q" value="$q">
		       <input type="submit" value="=&gt;">
		       <input type="hidden" name="offset" value="0">
		   </form>
	       </div><!--navigation-->

	       <div id="bodytext">

	       $report

	       $num_hits_info

	       <p style="font-size: smaller; color: #666">
		   <em>Powered by
		       <a href="http://www.rectangular.com/kinosearch/">
			   KinoSearch
		       </a>
		   </em>
	       </p>
	       </div><!--bodytext-->

	   </body>

	   </html>
	   END_HTML

COPYRIGHT
       Copyright 2005-2010 Marvin Humphrey

LICENSE, DISCLAIMER, BUGS, etc.
       See KinoSearch1 version 1.01.

perl v5.32.0			  2020-08-09	KinoSearch1::Docs::Tutorial(3)

NAME | DESCRIPTION | COPYRIGHT | LICENSE, DISCLAIMER, BUGS, etc.

Want to link to this manual page? Use this URL:
<https://www.freebsd.org/cgi/man.cgi?query=KinoSearch1::Docs::Tutorial&sektion=3&manpath=FreeBSD+12.2-RELEASE+and+Ports>

home | help