[Gnucash-changes] paginated result support added, limit raised to 10 archives

Sun May 9 08:13:23 EDT 2004

Log Message:
-----------
paginated result support added, limit raised to 10 archives

Modified Files:
--------------
    mail-search:
        search-tips.html
        search.php

Revision Data
-------------
Index: search.php
===================================================================
RCS file: /home/cvs/cvsroot/mail-search/search.php,v
retrieving revision 1.7
retrieving revision 1.8
diff -Lsearch.php -Lsearch.php -u -r1.7 -r1.8

--- search.php
+++ search.php
@@ -155,7 +155,16 @@
 # OMIT the final / from $base!
 $base = "/var/mailman/archives/public";
 $default_list = "gnucash-user";
-$default_length = 4; /* default time period to search - low for testing*/
+$default_length = 2; 	/* default time period to search */
+$max_length = 10;	/* maximum number of archives to search */
+$perpage = 20;		/* number of hits per page of results */
+/* resourcelimit - when a search produces few results, limit the total workload
+Common searches will be paginated by $perpage. Searches that only turn up
+a few hits per archive can cause timeout errors in PHP.
+Paginate these results according to every $resourcelimit number of files
+accessed - this is a calibration of server performance.
+Experimentation is recommended.  */
+$resourcelimit = 700;
 ########### end configuration #################
 
 /* termlist is the URL encoded string of search values*/
@@ -188,7 +197,7 @@
 $startmonth =	$_GET{"month"};   /* uses current month if empty */
 $startyear =	$_GET{"year"};    /* uses current year if empty */
 $period =	$_GET{"span"};    /* uses default_length if empty */
-
+$pager =	$_GET{"page"};
 /** useabstract - whether to use the quicker list or the longer abstract mode*/
 $useabstract = 	$_GET{"abstract"}; /* default is not to use abtracts */
 if($useabstract == "on") { $useabstract = 1; }
@@ -220,7 +229,8 @@
 }
 if($assert_list) { $list = $default_list; }
 
-if(!$period) { $period = $default_length; }
+if((!$period)||($period > $max_length)) { $period = $default_length; }
+if((!$pager)||($pager < 0))  { $pager = 0; }
 
 /* c is a private temporary variable, always reset c to zero before and after use */
 $c = 0;
@@ -339,8 +349,13 @@
 	print "Searches can continue into previous archives.<br>\n";
 	print "Select the number of months to include: ";
 	print "<select name=\"span\" size=\"1\">\n";
-	for($c=1;$c<=$period;$c++) {
-		print "<option value=\"$c\">$c</option>\n";
+	for($c=1;$c<=$max_length;$c++) {
+		if($c == $default_length) {
+			print "<option value=\"$c\" selected>$c</option>\n";
+		}
+		else {
+			print "<option value=\"$c\">$c</option>\n";
+		}
 	}
 	print "</select> (1 = only search one archive.)\n";
 ?>
@@ -396,6 +411,12 @@
 $startmonth = gmdate("n",$starttime);
 $startyear = gmdate("Y",$starttime);
 
+/* paging mechanism must preserve original, verified, conditions */
+$pagermonth =	$startmonth;
+$pageryear =	$startyear;
+$pagerlist =	$list;
+$pagerspan =	$period;
+
 if($period > 1) {
 	$period--;
 	if($startmonth < $period) {
@@ -455,6 +476,32 @@
 
 $self = $_SERVER["PHP_SELF"];
 if(!$self) { $self = "search.php"; }
+$dirlist = array();
+$paging = array();
+
+function get_abstract($snip) {
+	/* Build an entity-escaped abstract from the first 250 bytes of $snip once markup is stripped. */
+	$content = $snip; /* keep the raw markup; the <pre> fallback below searches it */
+	$snip = preg_replace("/&gt;/","",$snip);
+	$snip = preg_replace("/&lt;/","",$snip);
+	$snip = preg_replace("/<(.*)>/","",$snip);
+	$snip = preg_replace("/\n/"," ",$snip);
+	$snip = preg_replace("/-----BEGIN.PGP.SIGNED.MESSAGE-----.?Hash: SHA1/","",$snip);
+	$abstract = htmlentities(substr($snip,0,250));
+	if (!$abstract) {
+		/* Nothing survived the stripping: fall back to the first <pre> block.
+		   Group 1 is the text between the tags; /s lets . span newlines. */
+		$snip2 = array();
+		if (preg_match("/<pre>(.*)<\/pre>/is",$content,$snip2)) {
+			$snip = preg_replace("/<.*>/","",$snip2[1]);
+			$snip = preg_replace("/\n/","",$snip);
+			$snip = preg_replace("/[>\|]/","",$snip);
+			$snip = preg_replace("/-----BEGIN.PGP.SIGNED.MESSAGE-----.?Hash: SHA1/","",$snip);
+			$abstract = htmlentities(substr($snip,0,250));
+		}
+	}
+	return $abstract;
+}
 
 foreach($start as $folder) {
 	if(!($dp = @opendir("$base/$folder"))) {
@@ -472,109 +519,110 @@
 </body>
 </html>
 <?php
+		closedir($dp);
 		exit;
 	}
+	$order = array();
 	while($file = readdir($dp)) {
-		if(($file != '.')&&($file != '..')) {
-			if(ereg("date\.html",$file)) { continue; }
-			if(ereg("thread\.html",$file)) { continue; }
-			if(ereg("subject\.html",$file)) { continue; }
-			if(ereg("author\.html",$file)) { continue; }
-			if(ereg("index\.html",$file)) { continue; }
-			$file = "$folder/$file";
-			$temp = file("$base/$file");
-			$content = join('',$temp);
-			$d = 0;
-			if(eregi("<title>(.*)<\/title>",$content,$tag)) {
-				$title = htmlspecialchars($tag[1]);
-				$title = ereg_replace("&amp;amp;","&amp;",$title);
-			}
-			if(eregi("title=\"([- a-z0-9\,\.:\*\&\%\^\$\!]*)\"\;",
-			$content,$tag)) {
-				$title = htmlspecialchars($tag[1]);
+		if(ereg("[0-9]{6}\.html",$file)) {
+			array_push($order, $file);
+		}
+	}
+	closedir($dp);
+	sort($order); /* readdir() is not in message number order */
+	while(list(,$value) = each ($order)) {
+		array_push($dirlist, "$folder/$value");
+	}
+}
+
+$pagercount = count($dirlist);
+
+if($pager > 0) {
+	$pager = sprintf("%d",$pager);
+}
+$paging = array_slice($dirlist, $pager, $resourcelimit);
+$more = TRUE;
+if ((count($paging) + $pager) == count($dirlist)) { $more = FALSE; }
+$hits = 0;
+$tally = 0;
+
+while((list(,$file) = each ($paging))&&($hits < $perpage)) {
+	$snip = array();
+	$temp = file("$base/$file");
+	$content = join('',$temp);
+	$d = 0;
+	$tally++;
+	if(eregi("<title>(.*)<\/title>",$content,$tag)) {
+		$title = htmlspecialchars($tag[1]);
+		$title = ereg_replace("&amp;amp;","&amp;",$title);
+	}
+	if(eregi("title=\"([- a-z0-9\,\.:\*\&\%\^\$\!]*)\"\;",$content,$tag)) {
+		$title = htmlspecialchars($tag[1]); /*remove quote marks around message titles*/
+	}
+	$html = "";
+	$abstract = "";
+	$snip = array();
+	$html = preg_replace("/&gt;/","",$content);
+	$html = preg_replace("/&lt;/","",$html);
+	$content = preg_replace("/<a.*\/a>/i","",$html);
+	$snip = array();
+	eregi("<!--beginarticle-->(.*)<!--endarticle-->",$content,$snip);
+	if($snip[1]) { $content = $snip[1]; }
+	if($boolean == "AND") {
+		for($i=0;$i<$termscount;$i++) {
+			if($case == "sensitive") {
+				if(ereg($terms[$i],$content)) { $d++; }
 			}
-			$html = "";
-			$abstract = "";
-			$snip = array();
-			$html = preg_replace("/&gt;/","",$content);
-			$html = preg_replace("/&lt;/","",$html);
-			$html = preg_replace("/<a.*\/a>/i","",$html);
-			if(ereg("[0-9]{6}\.html",$file)) {
-				if (eregi("<!--beginarticle-->.*<!--endarticle-->",$content,$snip)) {
-					$snip = preg_replace("/&gt;/","",$snip);
-					$snip = preg_replace("/&lt;/","",$snip);
-					$snip[0] = preg_replace("/<(.*)>/","",$snip[0]);
-					$snip[0] = preg_replace("/\n/"," ",$snip[0]);
-					$snip[0] = preg_replace("/-----BEGIN.PGP.SIGNED.MESSAGE-----.?Hash: SHA1/","",$snip[0]);
-					$abstract = substr($snip[0],0,250);
-					$abstract = htmlentities($abstract);
-				}
-				if (!$abstract) {
-					$snip = array();
-					if (eregi("<pre>.*</pre>",$content,$snip)) {
-						$snip[0] = preg_replace("/<.*>/","",$snip[0]);
-						$snip[0] = preg_replace("/\n/","",$snip[0]);
-						$snip[0] = preg_replace("/[>\|]/","",$snip[0]);
-					$snip[0] = preg_replace("/-----BEGIN.PGP.SIGNED.MESSAGE-----.?Hash: SHA1/","",$snip[0]);
-						$abstract = substr($snip[0],0,250);
-						$abstract = htmlentities($abstract);
-					}
-				}
+			if($case == "insensitive") {
+				if(eregi($terms[$i],$content)) { $d++; }
 			}
-			$snip = array();
-			eregi("<!--beginarticle-->(.*)<!--endarticle-->",$content,$snip);
-			if($snip[0]) { $content = $snip[0]; }
-			if($boolean == "AND") {
-				for($i=0;$i<$termscount;$i++) {
-					if($case == "sensitive") {
-						if(ereg($terms[$i],$content)) { $d++; }
-					}
-					if($case == "insensitive") {
-						if(eregi($terms[$i],$content)) { $d++; }
+		}
+		if($d == $termscount) { /* wait until all terms match before making a hit */
+			if($useabstract) { $abstracts[$file] = get_abstract($content);}
+			$titles[$file] = $title;
+			$hits++;
+			continue;
+		}
+	}
+	if($boolean == "OR") {
+		if($case == "sensitive") {
+			for($i=0;$i<$termscount;$i++) {
+				if(ereg($terms[$i],$content)) {
+					if($useabstract) {
+						$abstracts[$file] = get_abstract($content);
 					}
-				}
-				if($d == $termscount) {
-					if($useabstract) { $abstracts[$file] = $abstract; }
 					$titles[$file] = $title;
+					$i = $termscount; /* ignore other terms, this is a hit */
+					$hits++;
+					continue;
 				}
 			}
-			if($boolean == "OR") {
-				if($case == "sensitive") {
-					for($i=0;$i<$termscount;$i++) {
-						if(ereg($terms[$i],$content)) {
-							if($useabstract) {
-								$abstracts[$file] = $abstract;
-							}
-							$titles[$file] = $title;
-							$i = $termscount;
-						}
-					}
-				}
-				if($case == "insensitive") {
-					for($i=0;$i<$termscount;$i++) {
-						if(eregi($terms[$i],$content)) {
-							if($useabstract) {
-								$abstracts[$file] = $abstract;
-							}
-							$titles[$file] = $title;
-							$i = $termscount;
-						}
+		}
+		if($case == "insensitive") {
+			for($i=0;$i<$termscount;$i++) {
+				if(eregi($terms[$i],$content)) {
+					if($useabstract) {
+						$abstracts[$file] = get_abstract($content);
 					}
+					$titles[$file] = $title;
+					$i = $termscount;
+					$hits++;
+					continue;
 				}
 			}
 		}
 	}
-	closedir($dp);
 }
+
 $item = count($titles);
 
 /* Search engine has finished, output results */
 print "<h1>Search results</h1>";
-print "<p>";
 if($useabstract) {
+	print "<p>";
 	print "Archive messages are shown with a short abstract.";
+	print " </p>";
 }
-print " </p>";
 
 if(($item == 0)||($item > 1)) {
 	print "<p>$item records matched.</p>\n";
@@ -582,11 +630,36 @@
 if($item == 1) {
 	print "<p>$item record matched.</p>\n";
 }
-print "<p>Search terms: <b>$termstring</b><br>\n";
+
+$totalcount = $pagercount;
+/* $tally contains a count of every file accessed for this page
+ for every $tally pages we had $hits hits
+ total proportion of hits = (hits/tally)*total
+*/
+if($more == TRUE) {
+	$pagercount = ($hits/$tally)*$pagercount;
+	$pagercount = sprintf("%d",$pagercount);
+}
+
+$tally += $pager;
+
+if($more == TRUE) {
+	print "<p><a href=\"$self?terms=$termlist&amp;month=$pagermonth&amp;year=$pageryear";
+	print "&amp;archive=$pagerlist&amp;boolean=$boolean&amp;case=$case";
+	if($useabstract) { print "&amp;abstract=on"; }
+	else { print "&amp;abstract="; }
+	print "&amp;span=$pagerspan&amp;page=$tally";
+	print "\">Next page</a> of upto $perpage of approximately $pagercount results.<br>\n";
+	print "Total number of messages to search: $totalcount\n";
+}
+else {
+	print "<p>End of search.</p>\n";
+}
+print "</p>\n<p>Search terms: <b>$termstring</b><br>\n";
 print "Case <b>$case</b></p>\n";
 print "<p><a href=\"$self\">New Search</a> </p>\n";
 print "<p>Archives searched:<br>\n";
-foreach($start as $item) { print "$item "; }
+foreach($start as $item) { print "<a href=\"/pipermail/$item\">$item</a> "; }
 print "</p>\n<div><ol>\n";
 natcasesort($titles);
 while(list($key,$value) = each($titles)) {
Index: search-tips.html
===================================================================
RCS file: /home/cvs/cvsroot/mail-search/search-tips.html,v
retrieving revision 1.4
retrieving revision 1.5
diff -Lsearch-tips.html -Lsearch-tips.html -u -r1.4 -r1.5
--- search-tips.html
+++ search-tips.html
@@ -147,12 +147,13 @@
 in the <b>gnucash-user</b> archive will be searched.</li>
 <li>If you select a month or year beyond the start date of the archive, the script will use the first
 available month.</li>
-<li>Select the number of previous months to include - the higher the number, the longer the search will take.</li>
+<li>Select the number of previous months to include - the higher the number, the more pages of results
+will be generated and the longer each search will take.</li>
 <li>Abstracts are created from the first 250 characters of an archived message to help you when the subject line
 of the original message is too brief or not apparently linked to the content.</li>
-<li>Use case-sensitive searches whenever you are looking for specific items of hardware, distributions, companies,
-<abbr title="etcetera">etc.</abbr> and enter details like the version number or specific type of problem
-to narrow the search range.</li>
+<li>Use case-sensitive searches whenever you are looking for specific items of hardware,
+distributions, companies, <abbr title="etcetera">etc.</abbr> and enter details like the version number
+or specific type of problem to narrow the search range.</li>
 <li>Select the Boolean value OR to search for any of the search terms entered. (The default search will only list
 files that match ALL search terms.)</li>
 <li><p>The search script will use default settings whenever possible. If you want to link to a
@@ -166,10 +167,10 @@
 <li>case - default : insensitive</li>
 </ul>
 <p>This can shorten a link from:</p>
-<code>https://lists.gnucash.org/search.php?terms=test&amp;archive=gnucash-user&amp;month=5&amp;year=2004
-&amp;span=1&amp;abstract=on&amp;boolean=AND&amp;case=insensitive</code>
+<code>https://lists.gnucash.org/search/search.php?terms=test&amp;archive=gnucash-user
+&amp;month=5&amp;year=2004&amp;span=1&amp;abstract=on&amp;boolean=AND&amp;case=insensitive</code>
 <p> to a much easier:</p>
-<code>https://lists.gnucash.org/search.php?terms=test</code></li></ol>
+<code>https://lists.gnucash.org/search/search.php?terms=test</code></li></ol>
 <p><a href="search.php">Back to search</a> </p>
 </div>
 <div style="clear:both;float:none;">&nbsp;</div>