Date: Tue, 20 Apr 2004 10:16:30 -0400
From: Christopher Murtagh
To: Joe R. Jah
Cc: htdig-dev@lists.sourceforge.net, lha@users.sourceforge.net
Subject: Re: [htdig-dev] Solved! - Re: Performance issue with exclude_urls

Ok, I've solved my problem, and can now have a list of working exclude_urls without the serious performance decrease. Here are the changes I made (sorry I'm not sending a proper diff file... need guidance on how to do that properly):

htdig/htdig.h
--------------------
added:

extern int exclude_checked;
extern int badquerystr_checked;
extern HtRegexList excludes;
extern HtRegexList badquerystr;

htdig/htdig.cc
----------------------
added these as global variable definitions:

int exclude_checked = 0;
int badquerystr_checked = 0;
HtRegexList excludes;
HtRegexList badquerystr;

htdig/Retriever.cc
added these conditionals and removed the previous tmpList.Create() and .setEscaped() calls:

if(!(exclude_checked)){
    //only parse this once and store into global variable
    tmpList.Destroy();
    tmpList.Create(config->Find(&aUrl, "exclude_urls"), " \t");
    excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
    exclude_checked = 1;
}

if(!(badquerystr_checked)){
    //only parse this once and store into global variable
    tmpList.Destroy();
    tmpList.Create(config->Find(&aUrl, "bad_querystr"), " \t");
    badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
    badquerystr_checked = 1;
}

[Note: the diff appended below names the two flags exclude_parsed and badquerystr_parsed rather than exclude_checked and badquerystr_checked.]

The difference in performance is night and day, and the excludes list is only parsed once per dig rather than at *every* URL found. If this is at all useful to anyone, let me know. I can send files or, if someone would enlighten me (even RTFM me), I can send diffs/patches.

Cheers,

Chris

--
Christopher Murtagh
Enterprise Systems Administrator
ISR / Web Communications Group
McGill University
Montreal, Quebec
Canada

Tel.: (514) 398-3122
Fax: (514) 398-2017

--- htdig.h.original 2004-04-20 10:05:55.000000000 -0400
+++ htdig.h 2004-04-20 10:08:47.000000000 -0400
@@ -40,7 +40,11 @@
 #include
 
 extern int debug;
+extern int exclude_parsed;
+extern int badquerystr_parsed;
 extern DocumentDB docs;
+extern HtRegexList excludes;
+extern HtRegexList badquerystr;
 extern HtRegexList limits;
 extern HtRegexList limitsn;
 extern HtRegexList excludes;
--- htdig.cc.original 2004-04-20 10:05:46.000000000 -0400
+++ htdig.cc 2004-04-20 10:08:35.000000000 -0400
@@ -57,7 +57,12 @@
 //
 int debug = 0;
 int report_statistics = 0;
+int exclude_parsed = 0;
+int badquerystr_parsed = 0;
+
 DocumentDB docs;
+HtRegexList excludes;
+HtRegexList badquerystr;
 HtRegexList limits;
 HtRegexList limitsn;
 FILE *urls_seen = NULL;
--- Retriever.cc.original 2004-04-20 10:06:18.000000000 -0400
+++ Retriever.cc 2004-04-20 10:09:02.000000000 -0400
@@ -995,9 +995,15 @@
     // If the URL contains any of the patterns in the exclude list,
     // mark it as invalid
     //
-    tmpList.Create(config->Find(&aUrl, "exclude_urls"), " \t");
-    HtRegexList excludes;
-    excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
+
+    if(!(exclude_parsed)){
+        //only parse this once and store into global variable
+        tmpList.Destroy();
+        tmpList.Create(config->Find(&aUrl, "exclude_urls"), " \t");
+        excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
+        exclude_parsed = 1;
+    }
+
     if (excludes.match(url, 0, 0) != 0)
     {
         if (debug > 2)
@@ -1009,10 +1015,14 @@
     // If the URL has a query string and it is in the bad query list
     // mark it as invalid
     //
-    tmpList.Destroy();
-    tmpList.Create(config->Find(&aUrl, "bad_querystr"), " \t");
-    HtRegexList badquerystr;
-    badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
+
+    if(!(badquerystr_parsed)){
+        //only parse this once and store into global variable
+        tmpList.Destroy();
+        tmpList.Create(config->Find(&aUrl, "bad_querystr"), " \t");
+        badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
+        badquerystr_parsed = 1;
+    }
     char *ext = strrchr((char *) url, '?');
     if (ext && badquerystr.match(ext, 0, 0) != 0)
     {