From: Alexey Rodriguez Subject: [htdig] Latin1.patch Date: Fri, 30 Jun 2000 08:49:34 -0700 Hello diggers!! It took a while because of lack of time, but I have finished the patch. This corrects the weird endings problem that I reported previously on this mailing list. It can now work with ispell files "out of the box", although the only problem was with the Spanish one. It also corrects the Latin-1 character encoding, which previously was coded for German. It now supports every Latin-1 ispell aff file. I tested with Spanish, German and Portuguese, but it should work for others. The mungeWord code is horribly slow but works for now; maybe I'll recode it using regexes to make it faster. E-mail me if you have trouble. Geoff, I had problems providing the dictionary, since mungeWord is a static function, so I did what I did. I hope it is OK. Alexey Rodriguez ---8<---- cut here ------8<------ diff -rc htdig-3.1.5/htfuzzy/Endings.h mod/htdig-3.1.5/htfuzzy/Endings.h *** htdig-3.1.5/htfuzzy/Endings.h Fri Feb 25 02:29:10 2000 --- mod/htdig-3.1.5/htfuzzy/Endings.h Thu Jun 29 14:37:09 2000 *************** *** 39,52 **** // int createDB(Configuration &config); ! static void mungeWord(char *, String &); private: Database *root2word; Database *word2root; ! int createRoot(Dictionary &, char *, char *, char *); ! int readRules(Dictionary &, char *); void expandWord(String &, List &, Dictionary &, char *, char *); }; --- 39,52 ---- // int createDB(Configuration &config); ! static void mungeWord(char *, String &, Dictionary &); private: Database *root2word; Database *word2root; ! int createRoot(Dictionary &, char *, char *, char *, Dictionary &); ! 
int readRules(Dictionary &, char *, Dictionary &); void expandWord(String &, List &, Dictionary &, char *, char *); }; diff -rc htdig-3.1.5/htfuzzy/EndingsDB.cc mod/htdig-3.1.5/htfuzzy/EndingsDB.cc *** htdig-3.1.5/htfuzzy/EndingsDB.cc Fri Feb 25 02:29:10 2000 --- mod/htdig-3.1.5/htfuzzy/EndingsDB.cc Fri Jun 30 10:01:28 2000 *************** *** 25,31 **** int Endings::createDB(Configuration &config) { ! Dictionary rules; String tmpdir = getenv("TMPDIR"); String word2root, root2word; if (tmpdir.length()) --- 25,31 ---- int Endings::createDB(Configuration &config) { ! Dictionary rules, lat_encoding; String tmpdir = getenv("TMPDIR"); String word2root, root2word; if (tmpdir.length()) *************** *** 45,58 **** if (debug) cout << "htfuzzy/endings: Reading rules\n"; ! if (readRules(rules, config["endings_affix_file"]) == NOTOK) return NOTOK; if (debug) cout << "htfuzzy/endings: Creating databases\n"; if (createRoot(rules, word2root, root2word, ! config["endings_dictionary"]) == NOTOK) return NOTOK; // --- 45,65 ---- if (debug) cout << "htfuzzy/endings: Reading rules\n"; ! if (readRules(rules, config["endings_affix_file"], lat_encoding) == NOTOK) return NOTOK; + lat_encoding.Start_Get(); + char *s; + while(s=lat_encoding.Get_Next()) + { + cout<') > 0) { List *list; ! SuffixEntry *se = new SuffixEntry(line); if (rules.Exists(currentSuffix)) { --- 187,193 ---- if (line.indexOf('>') > 0) { List *list; ! SuffixEntry *se = new SuffixEntry(line, lat_encoding); if (rules.Exists(currentSuffix)) { *************** *** 138,144 **** //***************************************************************************** int ! Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, char *dictFile) { FILE *fl = fopen(dictFile, "r"); if (fl == NULL) --- 211,217 ---- //***************************************************************************** int ! 
Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, char *dictFile, Dictionary & lat_encoding) { FILE *fl = fopen(dictFile, "r"); if (fl == NULL) *************** *** 173,179 **** *p++ = '\0'; ! mungeWord(input, word); expandWord(words, wordList, rules, word, p); if (debug > 1) --- 246,252 ---- *p++ = '\0'; ! mungeWord(input, word, lat_encoding); expandWord(words, wordList, rules, word, p); if (debug > 1) *************** *** 212,281 **** // any accents will be combined into single characters. // void ! Endings::mungeWord(char *input, String &word) { ! char *p = input + 1; word = 0; while (*input) { ! p = input + 1; ! switch (*p) { ! case '"': // The previous character needs to get an umlaut ! switch (*input) ! { ! case 'a': ! case 'A': ! word << char(228); ! input += 2; ! continue; ! break; ! case 'e': ! case 'E': ! word << char(235); ! input += 2; ! continue; ! break; ! case 'i': ! case 'I': ! word << char(239); ! input += 2; ! continue; ! break; ! case 'o': ! case 'O': ! word << char(246); ! input += 2; ! continue; ! break; ! case 'u': ! case 'U': ! word << char(252); ! input += 2; ! continue; ! break; ! } ! break; ! ! case 'S': // See if the previous character needs to be an sz ! if (*input == 's') ! { ! word << char(223); ! input += 2; ! continue; ! } ! else ! { ! word << *input; ! } ! break; ! ! default: ! word << *input; ! break; } - input++; } word.lowercase(); } --- 285,315 ---- // any accents will be combined into single characters. // void ! Endings::mungeWord(char *input, String &word, Dictionary &lat_encoding) { ! char *p = input + 1 , *s; ! int len; word = 0; while (*input) { ! lat_encoding.Start_Get(); ! // Replace ispell codification with latin1 codification ! // Slow, maybe in a next time this will be regexp'd ! while(s = lat_encoding.Get_Next()) { ! if(mystrncasecmp(input ,s ,strlen(s) ) == 0) ! { ! word << (String*) lat_encoding[s]; ! input += strlen( s ); ! break; ! } ! } ! if(!s) // No matches ! { ! word << *input; ! 
input ++; } } word.lowercase(); } diff -rc htdig-3.1.5/htfuzzy/SuffixEntry.cc mod/htdig-3.1.5/htfuzzy/SuffixEntry.cc *** htdig-3.1.5/htfuzzy/SuffixEntry.cc Fri Feb 25 02:29:10 2000 --- mod/htdig-3.1.5/htfuzzy/SuffixEntry.cc Thu Jun 29 14:44:43 2000 *************** *** 19,27 **** //***************************************************************************** // SuffixEntry::SuffixEntry() // ! SuffixEntry::SuffixEntry(char *str) { ! parse(str); } --- 19,27 ---- //***************************************************************************** // SuffixEntry::SuffixEntry() // ! SuffixEntry::SuffixEntry(char *str, Dictionary &lat_encoding) { ! parse(str, lat_encoding); } *************** *** 38,44 **** // Parse a string in the format '>' into ourselves. // void ! SuffixEntry::parse(char *str) { String temp = 0; --- 38,44 ---- // Parse a string in the format '>' into ourselves. // void ! SuffixEntry::parse(char *str, Dictionary &lat_encoding) { String temp = 0; *************** *** 56,70 **** while (*str == ' ' || *str == '\t' || *str == '>') str++; ! Endings::mungeWord(temp, expression); temp = 0; ! while (*str != ' ' && *str != '\t' && *str != '\n' && *str != '\r' && *str) { ! temp << *str; str++; } ! Endings::mungeWord(temp, rule); } - - --- 56,70 ---- while (*str == ' ' || *str == '\t' || *str == '>') str++; ! Endings::mungeWord(temp, expression, lat_encoding); temp = 0; ! while (*str != '#' && *str != '\n' && *str != '\r' && *str) { ! if(*str!= ' ' && *str!= '\t') { ! temp << *str; ! } str++; } ! Endings::mungeWord(temp, rule, lat_encoding); } diff -rc htdig-3.1.5/htfuzzy/SuffixEntry.h mod/htdig-3.1.5/htfuzzy/SuffixEntry.h *** htdig-3.1.5/htfuzzy/SuffixEntry.h Fri Feb 25 02:29:10 2000 --- mod/htdig-3.1.5/htfuzzy/SuffixEntry.h Thu Jun 29 14:43:08 2000 *************** *** 15,20 **** --- 15,21 ---- #define _SuffixEntry_h_ #include "Object.h" + #include #include *************** *** 24,36 **** // // Construction/Destruction // ! 
SuffixEntry(char *); ~SuffixEntry(); String expression; String rule; ! void parse(char *str); private: }; --- 25,37 ---- // // Construction/Destruction // ! SuffixEntry(char *, Dictionary &lat_encoding); ~SuffixEntry(); String expression; String rule; ! void parse(char *str, Dictionary &lat_encoding); private: };