And for my German colleagues, here is a version of the Soundex algorithm suited to German pronunciation. It is based on the Cologne phonetics (Kölner Phonetik) algorithm. More information can be found here: [http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik]
Soundex |
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Soundex
{
    /// <summary>
    /// Phonetic encoder for German words, modelled on the Cologne phonetics
    /// (Kölner Phonetik) algorithm.
    /// </summary>
    public class SoundexClass
    {
        /// <summary>
        /// Computes the German phonetic code of <paramref name="word"/>.
        /// </summary>
        /// <param name="word">
        /// The text to encode. Case is ignored; a fixed set of accented letters is
        /// folded to ASCII and every remaining character other than a-z and space
        /// is discarded before encoding.
        /// </param>
        /// <returns>
        /// A string of digits, or an empty string when the input is null, empty, or
        /// contains nothing that produces a code (for example "123" or "h").
        /// </returns>
        public static string createSoundexCodeDE(string word)
        {
            if (string.IsNullOrEmpty(word))
            {
                return "";
            }

            // Invariant lower-casing: a culture-sensitive ToLower() maps "I" to a
            // dotless "ı" under Turkish cultures, which would then be stripped below.
            word = word.ToLowerInvariant();

            // Folding: ç->c, v->f, w->f, j->i, y->i, ph->f, ä->a, ö->o, ü->u, ß->ss,
            // é->e, è->e, ê->e, à->a, á->a, â->a, ë->e
            word = word.Replace("ç", "c").Replace("v", "f").Replace("w", "f").Replace("j", "i")
                       .Replace("y", "i").Replace("ph", "f").Replace("ä", "a").Replace("ö", "o")
                       .Replace("ü", "u").Replace("ß", "ss").Replace("é", "e").Replace("è", "e")
                       .Replace("ê", "e").Replace("à", "a").Replace("á", "a").Replace("â", "a")
                       .Replace("ë", "e");

            // Letters only (no digits, no special characters); spaces are kept.
            word = Regex.Replace(word, @"[^A-Za-z ]+", "");

            // The stripping above may have removed everything.
            if (word.Length == 0)
            {
                return "";
            }

            StringBuilder code = new StringBuilder(word.Length + 1);
            int index = 0;

            // Special case at the start of the word: an initial 'c' is coded 4 before
            // a,h,k,l,o,q,r,u,x and 8 otherwise (including when it is the only letter).
            if (word[0] == 'c')
            {
                char second = word.Length > 1 ? word[1] : ' ';
                code.Append("ahkloqrux".IndexOf(second) >= 0 ? '4' : '8');
                index = 1;
            }

            // Conversion table:
            // ============================================
            // Letter          Context                 Code
            // -------------   ----------------------- ----
            // A,E,I,J,O,U,Y                           0
            // H                                       -
            // B                                       1
            // P               not before H            1
            // D,T             not before C,S,Z        2
            // F,V,W                                   3
            // P               before H                3
            // G,K,Q                                   4
            // C               at start of word
            //                 before A,H,K,L,O,Q,R,U,X 4
            // C               before A,H,K,O,Q,U,X
            //                 except after S,Z        4
            // X               not after C,K,Q         48
            // L                                       5
            // M,N                                     6
            // R                                       7
            // S,Z                                     8
            // C               after S,Z               8
            // C               at start of word except
            //                 before A,H,K,L,O,Q,R,U,X 8
            // C               not before A,H,K,O,Q,U,X 8
            // D,T             before C,S,Z            8
            // X               after C,K,Q             8
            // --------------------------------------------
            for (; index < word.Length; index++)
            {
                char current = word[index];
                // '\0' stands for "no neighbour"; it never matches any letter set below.
                char previous = index > 0 ? word[index - 1] : '\0';
                char next = index + 1 < word.Length ? word[index + 1] : '\0';

                switch (current)
                {
                    case 'a':
                    case 'e':
                    case 'i':
                    case 'o':
                    case 'u':
                        code.Append('0');
                        break;
                    case 'b':
                    case 'p':
                        code.Append('1');
                        break;
                    case 'd':
                    case 't':
                        code.Append("csz".IndexOf(next) >= 0 ? '8' : '2');
                        break;
                    case 'f':
                        code.Append('3');
                        break;
                    case 'g':
                    case 'k':
                    case 'q':
                        code.Append('4');
                        break;
                    case 'c':
                        if ("ahkoqux".IndexOf(next) >= 0 && "sz".IndexOf(previous) < 0)
                        {
                            code.Append('4');
                        }
                        else
                        {
                            code.Append('8');
                        }
                        break;
                    case 'x':
                        code.Append("ckq".IndexOf(previous) >= 0 ? "8" : "48");
                        break;
                    case 'l':
                        code.Append('5');
                        break;
                    case 'm':
                    case 'n':
                        code.Append('6');
                        break;
                    case 'r':
                        code.Append('7');
                        break;
                    case 's':
                    case 'z':
                        code.Append('8');
                        break;
                    // 'h', spaces and anything else produce no code.
                }
            }

            // Nothing was encoded (for example the word consisted only of 'h').
            if (code.Length == 0)
            {
                return "";
            }

            // Keep the first digit as-is, drop every later "0", then collapse runs of
            // identical digits. Done in a single pass so that a leading "0" cannot
            // cause the digit that follows it to be lost.
            StringBuilder result = new StringBuilder(code.Length);
            char last = code[0];
            result.Append(last);
            for (int i = 1; i < code.Length; i++)
            {
                char digit = code[i];
                if (digit == '0' || digit == last)
                {
                    continue;
                }
                result.Append(digit);
                last = digit;
            }

            return result.ToString();
        }
    }
}
· I’ve converted the above code from PHP to .NET, so please test it thoroughly before putting it into production. The original PHP script can be found here: [https://github.com/deezaster/germanphonetic/blob/master/x3m_soundex_ger.php]
The new X++ required to populate the Soundex Table is as follows:
Job to bulk populate Soundex Table (names + addresses) |
// Job that rebuilds the German ("de") rows of the Soundex table.
// It deletes the existing "de" rows, then splits every DirPartyTable.Name and
// every LogisticsPostalAddress.Address into words and inserts one Soundex row
// per accepted word, carrying the code returned by
// Soundex.SoundexClass::createSoundexCodeDE.
static void PopulateSoundexDE(Args _args)
{
    DirPartyTable           objDirPartyTable;
    LogisticsPostalAddress  objLogisticsPostalAddress;
    Soundex                 objSoundex;
    List                    list;
    ListIterator            iterator;
    str                     word, text, code;
    int                     position;

    // noise words for entities (DE)
    container noiseWords1 = ["???", "???", "???"];

    // noise word list for addresses (DE)
    container noiseWords2 = ["???", "???", "???"];

    // format for a valid word (alpha chars only)
    // NOTE(review): this pattern only accepts one upper-case ASCII letter followed
    // by lower-case ASCII letters, so words containing umlauts or ß, or starting
    // in lower case, are skipped - confirm that this is intended for German data.
    System.Text.RegularExpressions.Regex objRegex1 = new System.Text.RegularExpressions.Regex("^[A-Z][a-z]+$");

    // clear the German rows of the [Soundex] table
    delete_from objSoundex
        where objSoundex.LanguageId == "de";

    // process entity names
    while select RecId, Name from objDirPartyTable
    {
        text = objDirPartyTable.Name;

        // replace any punctuation and formatting with a word separator
        text = strReplace(text, '\n', ' ');
        text = strReplace(text, '-', ' ');
        text = strReplace(text, '/', ' ');
        text = strReplace(text, ',', ' ');
        text = strReplace(text, '.', ' ');

        // split text on space
        list = Global::strSplit(text, " ");
        iterator = new ListIterator(list);

        // position counts every token, including the ones that are skipped
        position = 1;
        while (iterator.more())
        {
            word = iterator.value();

            // keep only well-formed words that are not in the noise word list
            if (objRegex1.IsMatch(word) && conFind(noiseWords1, word) == 0)
            {
                objSoundex.initValue();
                objSoundex.ContextTableId = tableName2id("DirPartyTable");
                objSoundex.ContextRecId = objDirPartyTable.RecId;
                objSoundex.LanguageId = "de";
                objSoundex.Position = position;
                objSoundex.Word = word;

                code = Soundex.SoundexClass::createSoundexCodeDE(word);
                // NOTE(review): converting the digit string to an integer drops a
                // leading "0" and may not fit for long words - verify against the
                // SoundexCode field type.
                objSoundex.SoundexCode = str2int(code);
                objSoundex.insert();
            }

            position++;
            iterator.next();
        }
    }

    // process entity addresses
    while select RecId, Address from objLogisticsPostalAddress
    {
        text = objLogisticsPostalAddress.Address;

        // replace any punctuation and formatting with a word separator
        text = strReplace(text, '\n', ' ');
        text = strReplace(text, '-', ' ');
        text = strReplace(text, '/', ' ');
        text = strReplace(text, ',', ' ');
        text = strReplace(text, '.', ' ');

        // split text on space
        list = Global::strSplit(text, " ");
        iterator = new ListIterator(list);

        // position counts every token, including the ones that are skipped
        position = 1;
        while (iterator.more())
        {
            word = iterator.value();

            // keep only well-formed words that are not in the address noise word list
            if (objRegex1.IsMatch(word) && conFind(noiseWords2, word) == 0)
            {
                objSoundex.initValue();
                objSoundex.ContextTableId = tableName2id("LogisticsPostalAddress");
                objSoundex.ContextRecId = objLogisticsPostalAddress.RecId;
                objSoundex.LanguageId = "de";
                objSoundex.Position = position;
                objSoundex.Word = word;

                code = Soundex.SoundexClass::createSoundexCodeDE(word);
                // NOTE(review): same integer conversion caveat as for entity names.
                objSoundex.SoundexCode = str2int(code);
                objSoundex.insert();
            }

            position++;
            iterator.next();
        }
    }
}
REGARDS