001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.Locale; 020 021import org.apache.commons.codec.EncoderException; 022import org.apache.commons.codec.StringEncoder; 023 024/** 025 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977. 026 * <p> 027 * This class is immutable and thread-safe. 028 * </p> 029 * 030 * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a> 031 * @since 1.8 032 */ 033public class MatchRatingApproachEncoder implements StringEncoder { 034 035 private static final String SPACE = " "; 036 037 private static final String EMPTY = ""; 038 039 /** 040 * The plain letter equivalent of the accented letters. 041 */ 042 private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave 043 "AaEeIiOoUuYy" + // acute 044 "AaEeIiOoUuYy" + // circumflex 045 "AaOoNn" + // tilde 046 "AaEeIiOoUuYy" + // umlaut 047 "Aa" + // ring 048 "Cc" + // cedilla 049 "OoUu"; // double acute 050 051 /** 052 * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc... 053 */ 054 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" + 055 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" + 056 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" + 057 "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" + 058 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" + 059 "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171"; 060 061 private static final String[] DOUBLE_CONSONANT = 062 { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", 063 "TT", "VV", "WW", "XX", "YY", "ZZ" }; 064 065 /** 066 * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any 067 * spaces. 068 * 069 * <h2>API Usage</h2> 070 * <p> 071 * Consider this method private, it is package protected for unit testing only. 072 * </p> 073 * 074 * @param name 075 * The name to be cleaned 076 * @return The cleaned name 077 */ 078 String cleanName(final String name) { 079 String upperName = name.toUpperCase(Locale.ENGLISH); 080 081 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; 082 for (final String str : charsToTrim) { 083 upperName = upperName.replaceAll(str, EMPTY); 084 } 085 086 upperName = removeAccents(upperName); 087 upperName = upperName.replaceAll("\\s+", EMPTY); 088 089 return upperName; 090 } 091 092 /** 093 * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the 094 * Encoder interface Throws an EncoderException if input object is not of type java.lang.String. 095 * 096 * @param pObject 097 * Object to encode 098 * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the 099 * String supplied. 100 * @throws EncoderException 101 * if the parameter supplied is not of type java.lang.String 102 */ 103 @Override 104 public final Object encode(final Object pObject) throws EncoderException { 105 if (!(pObject instanceof String)) { 106 throw new EncoderException( 107 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String"); 108 } 109 return encode((String) pObject); 110 } 111 112 /** 113 * Encodes a String using the Match Rating Approach (MRA) algorithm. 114 * 115 * @param name 116 * String object to encode 117 * @return The MRA code corresponding to the String supplied 118 */ 119 @Override 120 public final String encode(String name) { 121 // Bulletproof for trivial input - NINO 122 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) { 123 return EMPTY; 124 } 125 126 // Preprocessing 127 name = cleanName(name); 128 129 // BEGIN: Actual encoding part of the algorithm... 130 // 1. Delete all vowels unless the vowel begins the word 131 name = removeVowels(name); 132 133 // 2. Remove second consonant from any double consonant 134 name = removeDoubleConsonants(name); 135 136 // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters 137 name = getFirst3Last3(name); 138 139 return name; 140 } 141 142 /** 143 * Gets the first and last 3 letters of a name (if > 6 characters) Else just returns the name. 144 * 145 * <h2>API Usage</h2> 146 * <p> 147 * Consider this method private, it is package protected for unit testing only. 148 * </p> 149 * 150 * @param name 151 * The string to get the substrings from 152 * @return Annexed first and last 3 letters of input word. 153 */ 154 String getFirst3Last3(final String name) { 155 final int nameLength = name.length(); 156 157 if (nameLength > 6) { 158 final String firstThree = name.substring(0, 3); 159 final String lastThree = name.substring(nameLength - 3, nameLength); 160 return firstThree + lastThree; 161 } 162 return name; 163 } 164 165 /** 166 * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the 167 * min rating. Values strictly from documentation. 168 * 169 * <h2>API Usage</h2> 170 * <p> 171 * Consider this method private, it is package protected for unit testing only. 172 * </p> 173 * 174 * @param sumLength 175 * The length of 2 strings sent down 176 * @return The min rating value 177 */ 178 int getMinRating(final int sumLength) { 179 int minRating = 0; 180 181 if (sumLength <= 4) { 182 minRating = 5; 183 } else if (sumLength <= 7) { // already know it is at least 5 184 minRating = 4; 185 } else if (sumLength <= 11) { // already know it is at least 8 186 minRating = 3; 187 } else if (sumLength == 12) { 188 minRating = 2; 189 } else { 190 minRating = 1; // docs said little here. 191 } 192 193 return minRating; 194 } 195 196 /** 197 * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the 198 * strings are cleaned in the same way as {@link #encode(String)}. 199 * 200 * @param name1 201 * First of the 2 strings (names) to compare 202 * @param name2 203 * Second of the 2 names to compare 204 * @return {@code true} if the encodings are identical {@code false} otherwise. 205 */ 206 public boolean isEncodeEquals(String name1, String name2) { 207 // Bulletproof for trivial input - NINO 208 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) { 209 return false; 210 } 211 if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) { 212 return false; 213 } 214 if (name1.length() == 1 || name2.length() == 1) { 215 return false; 216 } 217 if (name1.equalsIgnoreCase(name2)) { 218 return true; 219 } 220 221 // Preprocessing 222 name1 = cleanName(name1); 223 name2 = cleanName(name2); 224 225 // Actual MRA Algorithm 226 227 // 1. Remove vowels 228 name1 = removeVowels(name1); 229 name2 = removeVowels(name2); 230 231 // 2. Remove double consonants 232 name1 = removeDoubleConsonants(name1); 233 name2 = removeDoubleConsonants(name2); 234 235 // 3. Reduce down to 3 letters 236 name1 = getFirst3Last3(name1); 237 name2 = getFirst3Last3(name2); 238 239 // 4. Check for length difference - if 3 or greater, then no similarity 240 // comparison is done 241 if (Math.abs(name1.length() - name2.length()) >= 3) { 242 return false; 243 } 244 245 // 5. Obtain the minimum rating value by calculating the length sum of the 246 // encoded Strings and sending it down. 247 final int sumLength = Math.abs(name1.length() + name2.length()); 248 final int minRating = getMinRating(sumLength); 249 250 // 6. Process the encoded Strings from left to right and remove any 251 // identical characters found from both Strings respectively. 252 final int count = leftToRightThenRightToLeftProcessing(name1, name2); 253 254 // 7. Each PNI item that has a similarity rating equal to or greater than 255 // the min is considered to be a good candidate match 256 return count >= minRating; 257 258 } 259 260 /** 261 * Determines if a letter is a vowel. 262 * 263 * <h2>API Usage</h2> 264 * <p> 265 * Consider this method private, it is package protected for unit testing only. 266 * </p> 267 * 268 * @param letter 269 * The letter under investigation 270 * @return True if a vowel, else false 271 */ 272 boolean isVowel(final String letter) { 273 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") || 274 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U"); 275 } 276 277 /** 278 * Processes the names from left to right (first) then right to left removing identical letters in same positions. 279 * Then subtracts the longer string that remains from 6 and returns this. 280 * 281 * <h2>API Usage</h2> 282 * <p> 283 * Consider this method private, it is package protected for unit testing only. 284 * </p> 285 * 286 * @param name1 287 * name2 288 * @return the length as above 289 */ 290 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) { 291 final char[] name1Char = name1.toCharArray(); 292 final char[] name2Char = name2.toCharArray(); 293 294 final int name1Size = name1.length() - 1; 295 final int name2Size = name2.length() - 1; 296 297 String name1LtRStart = EMPTY; 298 String name1LtREnd = EMPTY; 299 300 String name2RtLStart = EMPTY; 301 String name2RtLEnd = EMPTY; 302 303 for (int i = 0; i < name1Char.length; i++) { 304 if (i > name2Size) { 305 break; 306 } 307 308 name1LtRStart = name1.substring(i, i + 1); 309 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1); 310 311 name2RtLStart = name2.substring(i, i + 1); 312 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1); 313 314 // Left to right... 315 if (name1LtRStart.equals(name2RtLStart)) { 316 name1Char[i] = ' '; 317 name2Char[i] = ' '; 318 } 319 320 // Right to left... 321 if (name1LtREnd.equals(name2RtLEnd)) { 322 name1Char[name1Size - i] = ' '; 323 name2Char[name2Size - i] = ' '; 324 } 325 } 326 327 // Char arrays -> string & remove extraneous space 328 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY); 329 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY); 330 331 // Final bit - subtract the longest string from 6 and return this int value 332 if (strA.length() > strB.length()) { 333 return Math.abs(6 - strA.length()); 334 } 335 return Math.abs(6 - strB.length()); 336 } 337 338 /** 339 * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved. 340 * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29 341 * 342 * @param accentedWord 343 * The word that may have accents in it. 344 * @return De-accented word 345 */ 346 String removeAccents(final String accentedWord) { 347 if (accentedWord == null) { 348 return null; 349 } 350 351 final StringBuilder sb = new StringBuilder(); 352 final int n = accentedWord.length(); 353 354 for (int i = 0; i < n; i++) { 355 final char c = accentedWord.charAt(i); 356 final int pos = UNICODE.indexOf(c); 357 if (pos > -1) { 358 sb.append(PLAIN_ASCII.charAt(pos)); 359 } else { 360 sb.append(c); 361 } 362 } 363 364 return sb.toString(); 365 } 366 367 /** 368 * Replaces any double consonant pair with the single letter equivalent. 369 * 370 * <h2>API Usage</h2> 371 * <p> 372 * Consider this method private, it is package protected for unit testing only. 373 * </p> 374 * 375 * @param name 376 * String to have double consonants removed 377 * @return Single consonant word 378 */ 379 String removeDoubleConsonants(final String name) { 380 String replacedName = name.toUpperCase(Locale.ENGLISH); 381 for (final String dc : DOUBLE_CONSONANT) { 382 if (replacedName.contains(dc)) { 383 final String singleLetter = dc.substring(0, 1); 384 replacedName = replacedName.replace(dc, singleLetter); 385 } 386 } 387 return replacedName; 388 } 389 390 /** 391 * Deletes all vowels unless the vowel begins the word. 392 * 393 * <h2>API Usage</h2> 394 * <p> 395 * Consider this method private, it is package protected for unit testing only. 396 * </p> 397 * 398 * @param name 399 * The name to have vowels removed 400 * @return De-voweled word 401 */ 402 String removeVowels(String name) { 403 // Extract first letter 404 final String firstLetter = name.substring(0, 1); 405 406 name = name.replace("A", EMPTY); 407 name = name.replace("E", EMPTY); 408 name = name.replace("I", EMPTY); 409 name = name.replace("O", EMPTY); 410 name = name.replace("U", EMPTY); 411 412 name = name.replaceAll("\\s{2,}\\b", SPACE); 413 414 // return isVowel(firstLetter) ? (firstLetter + name) : name; 415 if (isVowel(firstLetter)) { 416 return firstLetter + name; 417 } 418 return name; 419 } 420}