From toivo@ucs.uwa.edu.au Sun Aug 20 21:51:21 2000 Date: Mon, 21 Aug 2000 12:07:21 +0800 From: Toivo Pedaste To: htdig3-dev@htdig.org Subject: [htdig3-dev] Rejecting duplicates on htdig These are some rough patches to reject duplicates in htdig by using MD5 checksums; they seem to work. Any comments on them? They still need things added, such as a configuration file parameter, a compile test, etc. Note: They require libmhash. I have it on my Debian Linux system, but I have no idea how common this library is. You need to run automake and autoconf to get the compile to work. The md5.cc file goes into the htdig directory. The md5hash file gets written to the current directory. -- Toivo Pedaste Email: toivo@ucs.uwa.edu.au University Communications Services, Phone: +61 8 9 380 2605 University of Western Australia Fax: +61 8 9 380 1109 "The time has come", the Walrus said, "to talk of many things"... [Part 2, Text/PLAIN 94 lines] [Unable to print this part] ------------------------------------ To unsubscribe from the htdig3-dev mailing list, send a message to htdig3-dev-unsubscribe@htdig.org You will receive a message to confirm this. 
diff -u -r htdig3/htdig/Makefile.am htdig3.new/htdig/Makefile.am --- htdig3/htdig/Makefile.am Tue Mar 28 09:44:51 2000 +++ htdig3.new/htdig/Makefile.am Sun Aug 20 22:38:22 2000 @@ -6,11 +6,11 @@ htdig_SOURCES = Document.cc HTML.cc \ Parsable.cc Plaintext.cc \ Retriever.cc Server.cc ExternalTransport.cc \ - URLRef.cc htdig.cc ExternalParser.cc + URLRef.cc htdig.cc ExternalParser.cc md5.cc noinst_HEADERS = Document.h ExternalParser.h HTML.h \ Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \ ExternalTransport.h htdig_DEPENDENCIES = $(HTLIBS) -htdig_LDFLAGS = $(PROFILING) +htdig_LDFLAGS = $(PROFILING) -lmhash htdig_LDADD = $(HTLIBS) diff -u -r htdig3/htdig/Retriever.cc htdig3.new/htdig/Retriever.cc --- htdig3/htdig/Retriever.cc Wed Jun 14 09:49:53 2000 +++ htdig3.new/htdig/Retriever.cc Sun Aug 20 22:37:45 2000 @@ -107,6 +107,12 @@ unlink((char*)filelog); } + d_md5 = Database::getDatabaseInstance(DB_HASH); + + if (d_md5->OpenReadWrite("md5hash", 0666) != OK) { + cerr << "DocumentDB::Open: " << "md5hash" << " " << strerror(errno) << "\n"; + } + } @@ -115,6 +121,7 @@ // Retriever::~Retriever() { + d_md5->Close(); delete doc; } @@ -559,10 +566,32 @@ // Determine what to do by looking at the status code returned by // the Document retrieval process. 
// + +#define MD5_LENGTH 16 + void md5(char *buf, int len, char * rhash); + + String shash; + String sx; + char bhash[16]; + switch (status) { + case Transport::Document_ok: trackWords = 1; + + md5(doc->Contents(),doc->Length(),bhash); + shash.append(bhash,MD5_LENGTH); + + d_md5->Get(shash,sx); + if (!sx.empty()) { + if (debug) { + cout << "DUP\n"; + } + break; + } + d_md5->Put(shash,"x"); + if (old_document) { if (doc->ModTime() == ref->DocTime()) diff -u -r htdig3/htdig/Retriever.h htdig3.new/htdig/Retriever.h --- htdig3/htdig/Retriever.h Tue Mar 28 09:49:12 2000 +++ htdig3.new/htdig/Retriever.h Sun Aug 20 22:43:24 2000 @@ -24,6 +24,7 @@ #include "HtWordReference.h" #include "List.h" #include "StringList.h" +#include "DocumentDB.h" class URL; class Document; @@ -129,6 +130,8 @@ // we reuse. // Document *doc; + + Database *d_md5; String notFound;