summaryrefslogtreecommitdiff
path: root/android/webkit/FindAddress.java
blob: 31b242739882a8c8254b8cffe554508be9908d09 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.webkit;

import java.util.Locale;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Java implementation of legacy WebView.findAddress algorithm.
 *
 * @hide
 */
class FindAddress {
    static class ZipRange {
        int mLow;
        int mHigh;
        int mException1;
        int mException2;
        ZipRange(int low, int high, int exception1, int exception2) {
            mLow = low;
            mHigh = high;
            mException1 = exception1;
            mException2 = exception1;
        }
        boolean matches(String zipCode) {
            int prefix = Integer.parseInt(zipCode.substring(0, 2));
            return (mLow <= prefix && prefix <= mHigh) || prefix == mException1
                    || prefix == mException2;
        }
    }

    // Addresses consist of at least this many words, not including state and zip code.
    private static final int MIN_ADDRESS_WORDS = 4;

    // Adddresses consist of at most this many words, not including state and zip code.
    private static final int MAX_ADDRESS_WORDS = 14;

    // Addresses consist of at most this many lines.
    private static final int MAX_ADDRESS_LINES = 5;

    // No words in an address are longer than this many characters.
    private static final int kMaxAddressNameWordLength = 25;

    // Location name should be in the first MAX_LOCATION_NAME_DISTANCE words
    private static final int MAX_LOCATION_NAME_DISTANCE = 5;

    private static final ZipRange[] sStateZipCodeRanges = {
            new ZipRange(99, 99, -1, -1), // AK Alaska.
            new ZipRange(35, 36, -1, -1), // AL Alabama.
            new ZipRange(71, 72, -1, -1), // AR Arkansas.
            new ZipRange(96, 96, -1, -1), // AS American Samoa.
            new ZipRange(85, 86, -1, -1), // AZ Arizona.
            new ZipRange(90, 96, -1, -1), // CA California.
            new ZipRange(80, 81, -1, -1), // CO Colorado.
            new ZipRange(6, 6, -1, -1), // CT Connecticut.
            new ZipRange(20, 20, -1, -1), // DC District of Columbia.
            new ZipRange(19, 19, -1, -1), // DE Delaware.
            new ZipRange(32, 34, -1, -1), // FL Florida.
            new ZipRange(96, 96, -1, -1), // FM Federated States of Micronesia.
            new ZipRange(30, 31, -1, -1), // GA Georgia.
            new ZipRange(96, 96, -1, -1), // GU Guam.
            new ZipRange(96, 96, -1, -1), // HI Hawaii.
            new ZipRange(50, 52, -1, -1), // IA Iowa.
            new ZipRange(83, 83, -1, -1), // ID Idaho.
            new ZipRange(60, 62, -1, -1), // IL Illinois.
            new ZipRange(46, 47, -1, -1), // IN Indiana.
            new ZipRange(66, 67, 73, -1), // KS Kansas.
            new ZipRange(40, 42, -1, -1), // KY Kentucky.
            new ZipRange(70, 71, -1, -1), // LA Louisiana.
            new ZipRange(1, 2, -1, -1), // MA Massachusetts.
            new ZipRange(20, 21, -1, -1), // MD Maryland.
            new ZipRange(3, 4, -1, -1), // ME Maine.
            new ZipRange(96, 96, -1, -1), // MH Marshall Islands.
            new ZipRange(48, 49, -1, -1), // MI Michigan.
            new ZipRange(55, 56, -1, -1), // MN Minnesota.
            new ZipRange(63, 65, -1, -1), // MO Missouri.
            new ZipRange(96, 96, -1, -1), // MP Northern Mariana Islands.
            new ZipRange(38, 39, -1, -1), // MS Mississippi.
            new ZipRange(55, 56, -1, -1), // MT Montana.
            new ZipRange(27, 28, -1, -1), // NC North Carolina.
            new ZipRange(58, 58, -1, -1), // ND North Dakota.
            new ZipRange(68, 69, -1, -1), // NE Nebraska.
            new ZipRange(3, 4, -1, -1), // NH New Hampshire.
            new ZipRange(7, 8, -1, -1), // NJ New Jersey.
            new ZipRange(87, 88, 86, -1), // NM New Mexico.
            new ZipRange(88, 89, 96, -1), // NV Nevada.
            new ZipRange(10, 14, 0, 6), // NY New York.
            new ZipRange(43, 45, -1, -1), // OH Ohio.
            new ZipRange(73, 74, -1, -1), // OK Oklahoma.
            new ZipRange(97, 97, -1, -1), // OR Oregon.
            new ZipRange(15, 19, -1, -1), // PA Pennsylvania.
            new ZipRange(6, 6, 0, 9), // PR Puerto Rico.
            new ZipRange(96, 96, -1, -1), // PW Palau.
            new ZipRange(2, 2, -1, -1), // RI Rhode Island.
            new ZipRange(29, 29, -1, -1), // SC South Carolina.
            new ZipRange(57, 57, -1, -1), // SD South Dakota.
            new ZipRange(37, 38, -1, -1), // TN Tennessee.
            new ZipRange(75, 79, 87, 88), // TX Texas.
            new ZipRange(84, 84, -1, -1), // UT Utah.
            new ZipRange(22, 24, 20, -1), // VA Virginia.
            new ZipRange(6, 9, -1, -1), // VI Virgin Islands.
            new ZipRange(5, 5, -1, -1), // VT Vermont.
            new ZipRange(98, 99, -1, -1), // WA Washington.
            new ZipRange(53, 54, -1, -1), // WI Wisconsin.
            new ZipRange(24, 26, -1, -1), // WV West Virginia.
            new ZipRange(82, 83, -1, -1) // WY Wyoming.
    };

    // Newlines
    private static final String NL = "\n\u000B\u000C\r\u0085\u2028\u2029";

    // Space characters
    private static final String SP = "\u0009\u0020\u00A0\u1680\u2000\u2001"
            + "\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F"
            + "\u205F\u3000";

    // Whitespace
    private static final String WS = SP + NL;

    // Characters that are considered word delimiters.
    private static final String WORD_DELIM = ",*\u2022" + WS;

    // Lookahead for word end.
    private static final String WORD_END = "(?=[" + WORD_DELIM + "]|$)";

    // Address words are a sequence of non-delimiter characters.
    private static final Pattern sWordRe =
            Pattern.compile("[^" + WORD_DELIM + "]+" + WORD_END, Pattern.CASE_INSENSITIVE);

    // Characters that are considered suffix delimiters for house numbers.
    private static final String HOUSE_POST_DELIM = ",\"'" + WS;

    // Lookahead for house end.
    private static final String HOUSE_END = "(?=[" + HOUSE_POST_DELIM + "]|$)";

    // Characters that are considered prefix delimiters for house numbers.
    private static final String HOUSE_PRE_DELIM = ":" + HOUSE_POST_DELIM;

    // A house number component is "one" or a number, optionally
    // followed by a single alphabetic character, or
    private static final String HOUSE_COMPONENT = "(?:one|\\d+([a-z](?=[^a-z]|$)|st|nd|rd|th)?)";

    // House numbers are a repetition of |HOUSE_COMPONENT|, separated by -, and followed by
    // a delimiter character.
    private static final Pattern sHouseNumberRe =
            Pattern.compile(HOUSE_COMPONENT + "(?:-" + HOUSE_COMPONENT + ")*" + HOUSE_END,
                    Pattern.CASE_INSENSITIVE);

    // XXX: do we want to accept whitespace other than 0x20 in state names?
    private static final Pattern sStateRe = Pattern.compile("(?:"
                    + "(ak|alaska)|"
                    + "(al|alabama)|"
                    + "(ar|arkansas)|"
                    + "(as|american[" + SP + "]+samoa)|"
                    + "(az|arizona)|"
                    + "(ca|california)|"
                    + "(co|colorado)|"
                    + "(ct|connecticut)|"
                    + "(dc|district[" + SP + "]+of[" + SP + "]+columbia)|"
                    + "(de|delaware)|"
                    + "(fl|florida)|"
                    + "(fm|federated[" + SP + "]+states[" + SP + "]+of[" + SP + "]+micronesia)|"
                    + "(ga|georgia)|"
                    + "(gu|guam)|"
                    + "(hi|hawaii)|"
                    + "(ia|iowa)|"
                    + "(id|idaho)|"
                    + "(il|illinois)|"
                    + "(in|indiana)|"
                    + "(ks|kansas)|"
                    + "(ky|kentucky)|"
                    + "(la|louisiana)|"
                    + "(ma|massachusetts)|"
                    + "(md|maryland)|"
                    + "(me|maine)|"
                    + "(mh|marshall[" + SP + "]+islands)|"
                    + "(mi|michigan)|"
                    + "(mn|minnesota)|"
                    + "(mo|missouri)|"
                    + "(mp|northern[" + SP + "]+mariana[" + SP + "]+islands)|"
                    + "(ms|mississippi)|"
                    + "(mt|montana)|"
                    + "(nc|north[" + SP + "]+carolina)|"
                    + "(nd|north[" + SP + "]+dakota)|"
                    + "(ne|nebraska)|"
                    + "(nh|new[" + SP + "]+hampshire)|"
                    + "(nj|new[" + SP + "]+jersey)|"
                    + "(nm|new[" + SP + "]+mexico)|"
                    + "(nv|nevada)|"
                    + "(ny|new[" + SP + "]+york)|"
                    + "(oh|ohio)|"
                    + "(ok|oklahoma)|"
                    + "(or|oregon)|"
                    + "(pa|pennsylvania)|"
                    + "(pr|puerto[" + SP + "]+rico)|"
                    + "(pw|palau)|"
                    + "(ri|rhode[" + SP + "]+island)|"
                    + "(sc|south[" + SP + "]+carolina)|"
                    + "(sd|south[" + SP + "]+dakota)|"
                    + "(tn|tennessee)|"
                    + "(tx|texas)|"
                    + "(ut|utah)|"
                    + "(va|virginia)|"
                    + "(vi|virgin[" + SP + "]+islands)|"
                    + "(vt|vermont)|"
                    + "(wa|washington)|"
                    + "(wi|wisconsin)|"
                    + "(wv|west[" + SP + "]+virginia)|"
                    + "(wy|wyoming)"
                    + ")" + WORD_END,
            Pattern.CASE_INSENSITIVE);

    private static final Pattern sLocationNameRe = Pattern.compile("(?:"
                    + "alley|annex|arcade|ave[.]?|avenue|alameda|bayou|"
                    + "beach|bend|bluffs?|bottom|boulevard|branch|bridge|"
                    + "brooks?|burgs?|bypass|broadway|camino|camp|canyon|"
                    + "cape|causeway|centers?|circles?|cliffs?|club|common|"
                    + "corners?|course|courts?|coves?|creek|crescent|crest|"
                    + "crossing|crossroad|curve|circulo|dale|dam|divide|"
                    + "drives?|estates?|expressway|extensions?|falls?|ferry|"
                    + "fields?|flats?|fords?|forest|forges?|forks?|fort|"
                    + "freeway|gardens?|gateway|glens?|greens?|groves?|"
                    + "harbors?|haven|heights|highway|hills?|hollow|inlet|"
                    + "islands?|isle|junctions?|keys?|knolls?|lakes?|land|"
                    + "landing|lane|lights?|loaf|locks?|lodge|loop|mall|"
                    + "manors?|meadows?|mews|mills?|mission|motorway|mount|"
                    + "mountains?|neck|orchard|oval|overpass|parks?|"
                    + "parkways?|pass|passage|path|pike|pines?|plains?|"
                    + "plaza|points?|ports?|prairie|privada|radial|ramp|"
                    + "ranch|rapids?|rd[.]?|rest|ridges?|river|roads?|route|"
                    + "row|rue|run|shoals?|shores?|skyway|springs?|spurs?|"
                    + "squares?|station|stravenue|stream|st[.]?|streets?|"
                    + "summit|speedway|terrace|throughway|trace|track|"
                    + "trafficway|trail|tunnel|turnpike|underpass|unions?|"
                    + "valleys?|viaduct|views?|villages?|ville|vista|walks?|"
                    + "wall|ways?|wells?|xing|xrd)" + WORD_END,
            Pattern.CASE_INSENSITIVE);

    private static final Pattern sSuffixedNumberRe =
            Pattern.compile("(\\d+)(st|nd|rd|th)", Pattern.CASE_INSENSITIVE);

    private static final Pattern sZipCodeRe =
            Pattern.compile("(?:\\d{5}(?:-\\d{4})?)" + WORD_END, Pattern.CASE_INSENSITIVE);

    private static boolean checkHouseNumber(String houseNumber) {
        // Make sure that there are at most 5 digits.
        int digitCount = 0;
        for (int i = 0; i < houseNumber.length(); ++i) {
            if (Character.isDigit(houseNumber.charAt(i))) ++digitCount;
        }
        if (digitCount > 5) return false;

        // Make sure that any ordinals are valid.
        Matcher suffixMatcher = sSuffixedNumberRe.matcher(houseNumber);
        while (suffixMatcher.find()) {
            int num = Integer.parseInt(suffixMatcher.group(1));
            if (num == 0) {
                return false; // 0th is invalid.
            }
            String suffix = suffixMatcher.group(2).toLowerCase(Locale.getDefault());
            switch (num % 10) {
                case 1:
                    return suffix.equals(num % 100 == 11 ? "th" : "st");
                case 2:
                    return suffix.equals(num % 100 == 12 ? "th" : "nd");
                case 3:
                    return suffix.equals(num % 100 == 13 ? "th" : "rd");
                default:
                    return suffix.equals("th");
            }
        }
        return true;
    }

    /**
     * Attempt to match a house number beginnning at position offset
     * in content.  The house number must be followed by a word
     * delimiter or the end of the string, and if offset is non-zero,
     * then it must also be preceded by a word delimiter.
     *
     * @return a MatchResult if a valid house number was found.
     */
    private static MatchResult matchHouseNumber(String content, int offset) {
        if (offset > 0 && HOUSE_PRE_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null;
        Matcher matcher = sHouseNumberRe.matcher(content).region(offset, content.length());
        if (matcher.lookingAt()) {
            MatchResult matchResult = matcher.toMatchResult();
            if (checkHouseNumber(matchResult.group(0))) return matchResult;
        }
        return null;
    }

    /**
     * Attempt to match a US state beginnning at position offset in
     * content.  The matching state must be followed by a word
     * delimiter or the end of the string, and if offset is non-zero,
     * then it must also be preceded by a word delimiter.
     *
     * @return a MatchResult if a valid US state (or two letter code)
     * was found.
     */
    private static MatchResult matchState(String content, int offset) {
        if (offset > 0 && WORD_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null;
        Matcher stateMatcher = sStateRe.matcher(content).region(offset, content.length());
        return stateMatcher.lookingAt() ? stateMatcher.toMatchResult() : null;
    }

    /**
     * Test whether zipCode matches the U.S. zip code format (ddddd or
     * ddddd-dddd) and is within the expected range, given that
     * stateMatch is a match of sStateRe.
     *
     * @return true if zipCode is a valid zip code, is legal for the
     * matched state, and is followed by a word delimiter or the end
     * of the string.
     */
    private static boolean isValidZipCode(String zipCode, MatchResult stateMatch) {
        if (stateMatch == null) return false;
        // Work out the index of the state, based on which group matched.
        int stateIndex = stateMatch.groupCount();
        while (stateIndex > 0) {
            if (stateMatch.group(stateIndex--) != null) break;
        }
        return sZipCodeRe.matcher(zipCode).matches()
                && sStateZipCodeRanges[stateIndex].matches(zipCode);
    }

    /**
     * Test whether location is one of the valid locations.
     *
     * @return true if location starts with a valid location name
     * followed by a word delimiter or the end of the string.
     */
    private static boolean isValidLocationName(String location) {
        return sLocationNameRe.matcher(location).matches();
    }

    /**
     * Attempt to match a complete address in content, starting with
     * houseNumberMatch.
     *
     * @param content The string to search.
     * @param houseNumberMatch A matching house number to start extending.
     * @return +ve: the end of the match
     *         +ve: the position to restart searching for house numbers, negated.
     */
    private static int attemptMatch(String content, MatchResult houseNumberMatch) {
        int restartPos = -1;
        int nonZipMatch = -1;
        int it = houseNumberMatch.end();
        int numLines = 1;
        boolean consecutiveHouseNumbers = true;
        boolean foundLocationName = false;
        int wordCount = 1;
        String lastWord = "";

        Matcher matcher = sWordRe.matcher(content);

        for (; it < content.length(); lastWord = matcher.group(0), it = matcher.end()) {
            if (!matcher.find(it)) {
                // No more words in the input sequence.
                return -content.length();
            }
            if (matcher.end() - matcher.start() > kMaxAddressNameWordLength) {
                // Word is too long to be part of an address. Fail.
                return -matcher.end();
            }

            // Count the number of newlines we just consumed.
            while (it < matcher.start()) {
                if (NL.indexOf(content.charAt(it++)) != -1) ++numLines;
            }

            // Consumed too many lines. Fail.
            if (numLines > MAX_ADDRESS_LINES) break;

            // Consumed too many words. Fail.
            if (++wordCount > MAX_ADDRESS_WORDS) break;

            if (matchHouseNumber(content, it) != null) {
                if (consecutiveHouseNumbers && numLines > 1) {
                    // Last line ended with a number, and this this line starts with one.
                    // Restart at this number.
                    return -it;
                }
                // Remember the position of this match as the restart position.
                if (restartPos == -1) restartPos = it;
                continue;
            }

            consecutiveHouseNumbers = false;

            if (isValidLocationName(matcher.group(0))) {
                foundLocationName = true;
                continue;
            }

            if (wordCount == MAX_LOCATION_NAME_DISTANCE && !foundLocationName) {
                // Didn't find a location name in time. Fail.
                it = matcher.end();
                break;
            }

            if (foundLocationName && wordCount > MIN_ADDRESS_WORDS) {
                // We can now attempt to match a state.
                MatchResult stateMatch = matchState(content, it);
                if (stateMatch != null) {
                    if (lastWord.equals("et") && stateMatch.group(0).equals("al")) {
                        // Reject "et al" as a false postitive.
                        it = stateMatch.end();
                        break;
                    }

                    // At this point we've matched a state; try to match a zip code after it.
                    Matcher zipMatcher = sWordRe.matcher(content);
                    if (zipMatcher.find(stateMatch.end())
                            && isValidZipCode(zipMatcher.group(0), stateMatch)) {
                        return zipMatcher.end();
                    }
                    // The content ends with a state but no zip
                    // code. This is a legal match according to the
                    // documentation. N.B. This differs from the
                    // original c++ implementation, which only allowed
                    // the zip code to be optional at the end of the
                    // string, which presumably is a bug.  Now we
                    // prefer to find a match with a zip code, but
                    // remember non-zip matches and return them if
                    // necessary.
                    nonZipMatch = stateMatch.end();
                }
            }
        }

        if (nonZipMatch > 0) return nonZipMatch;

        return -(restartPos > 0 ? restartPos : it);
    }

    /**
     * Return the first matching address in content.
     *
     * @param content The string to search.
     * @return The first valid address, or null if no address was matched.
     */
    static String findAddress(String content) {
        Matcher houseNumberMatcher = sHouseNumberRe.matcher(content);
        int start = 0;
        while (houseNumberMatcher.find(start)) {
            if (checkHouseNumber(houseNumberMatcher.group(0))) {
                start = houseNumberMatcher.start();
                int end = attemptMatch(content, houseNumberMatcher);
                if (end > 0) {
                    return content.substring(start, end);
                }
                start = -end;
            } else {
                start = houseNumberMatcher.end();
            }
        }
        return null;
    }
}