aboutsummaryrefslogtreecommitdiff
path: root/src/com/android/providers/contacts/aggregation/util/RawContactMatcher.java
blob: ef5dd87bcd063499c2b68ba31416de9de408d6f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package com.android.providers.contacts.aggregation.util;

import android.util.ArrayMap;
import android.util.Log;
import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
import com.android.providers.contacts.util.Hex;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Logic for matching raw contacts' data.
 */
public class RawContactMatcher {
    private static final String TAG = "ContactMatcher";

    // Suggest to aggregate contacts if their match score is equal or greater than this threshold
    public static final int SCORE_THRESHOLD_SUGGEST = 50;

    public static final int SCORE_THRESHOLD_NO_NAME = 50;

    // Automatically aggregate contacts if their match score is equal or greater than this threshold
    public static final int SCORE_THRESHOLD_PRIMARY = 70;

    // Automatically aggregate contacts if the match score is equal or greater than this threshold
    // and there is a secondary match (phone number, email etc).
    public static final int SCORE_THRESHOLD_SECONDARY = 50;

    // Score for matching phone numbers
    private static final int PHONE_MATCH_SCORE = 71;

    // Score for matching email addresses
    private static final int EMAIL_MATCH_SCORE = 71;

    // Score for matching identity
    private static final int IDENTITY_MATCH_SCORE = 71;

    // Score for matching nickname
    private static final int NICKNAME_MATCH_SCORE = 71;

    // Maximum number of characters in a name to be considered by the matching algorithm.
    private static final int MAX_MATCHED_NAME_LENGTH = 30;

    // Scores a multiplied by this number to allow room for "fractional" scores
    private static final int SCORE_SCALE = 1000;

    public static final int MATCHING_ALGORITHM_EXACT = 0;
    public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
    public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;

    // Minimum edit distance between two names to be considered an approximate match
    public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;

    // Minimum edit distance between two email ids to be considered an approximate match
    public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;

    // Returned value when we found multiple matches and that was not allowed
    public static final long MULTIPLE_MATCHES = -2;

    /**
     * Name matching scores: a matrix by name type vs. candidate lookup type.
     * For example, if the name type is "full name" while we are looking for a
     * "full name", the score may be 99. If we are looking for a "nickname" but
     * find "first name", the score may be 50 (see specific scores defined
     * below.)
     * <p>
     * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     * between the strings is below the threshold.
     * <p>
     * We use a string matching algorithm, which is particularly suited for
     * name matching. See {@link NameDistance}.
     */
    private static int[] sMinScore =
            new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
    private static int[] sMaxScore =
            new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];

    /*
     * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
     * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
     * not!  They are useful in three-way aggregation cases when we have, for example, both
     * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
     * with the former rather than the latter.  This is why "reverse" matches have slightly lower
     * scores than direct matches.
     */
    static {
        setScoreRange(NameLookupType.NAME_EXACT,
                NameLookupType.NAME_EXACT, 99, 99);
        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
                NameLookupType.NAME_COLLATION_KEY, 50, 80);

        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
                NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
                NameLookupType.NICKNAME, 50, 60);

        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
                NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
                NameLookupType.NAME_COLLATION_KEY, 50, 60);
        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
                NameLookupType.NICKNAME, 50, 60);

        setScoreRange(NameLookupType.NICKNAME,
                NameLookupType.NICKNAME, 50, 60);
        setScoreRange(NameLookupType.NICKNAME,
                NameLookupType.NAME_COLLATION_KEY, 50, 60);
        setScoreRange(NameLookupType.NICKNAME,
                NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    }

    /**
     * Populates the cells of the score matrix and score span matrix
     * corresponding to the {@code candidateNameType} and {@code nameType}.
     */
    private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom,
            int scoreTo) {
        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
        sMinScore[index] = scoreFrom;
        sMaxScore[index] = scoreTo;
    }

    /**
     * Returns the lower range for the match score for the given {@code candidateNameType} and
     * {@code nameType}.
     */
    private static int getMinScore(int candidateNameType, int nameType) {
        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
        return sMinScore[index];
    }

    /**
     * Returns the upper range for the match score for the given {@code candidateNameType} and
     * {@code nameType}.
     */
    private static int getMaxScore(int candidateNameType, int nameType) {
        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
        return sMaxScore[index];
    }

    private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>();
    private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    private int mScoreCount = 0;

    private final NameDistance mNameDistanceConservative = new NameDistance();
    private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);

    private MatchScore getMatchingScore(long rawContactId, long contactId, long accountId) {
        MatchScore matchingScore = mScores.get(rawContactId);
        if (matchingScore == null) {
            if (mScoreList.size() > mScoreCount) {
                matchingScore = mScoreList.get(mScoreCount);
                matchingScore.reset(rawContactId, contactId, accountId);
            } else {
                matchingScore = new MatchScore(rawContactId, contactId, accountId);
                mScoreList.add(matchingScore);
            }
            mScoreCount++;
            mScores.put(rawContactId, matchingScore);
        }
        return matchingScore;
    }

    /**
     * Checks if there is a match and updates the overall score for the
     * specified contact for a discovered match. The new score is determined
     * by the prior score, by the type of name we were looking for, the type
     * of name we found and, if the match is approximate, the distance between the candidate and
     * actual name.
     */
    public void matchName(long rawContactId, long contactId, long accountId, int
            candidateNameType, String candidateName, int nameType, String name, int algorithm) {
        int maxScore = getMaxScore(candidateNameType, nameType);
        if (maxScore == 0) {
            return;
        }

        if (candidateName.equals(name)) {
            updatePrimaryScore(rawContactId, contactId, accountId, maxScore);
            return;
        }

        if (algorithm == MATCHING_ALGORITHM_EXACT) {
            return;
        }

        int minScore = getMinScore(candidateNameType, nameType);
        if (minScore == maxScore) {
            return;
        }

        final byte[] decodedCandidateName;
        final byte[] decodedName;
        try {
            decodedCandidateName = Hex.decodeHex(candidateName);
            decodedName = Hex.decodeHex(name);
        } catch (RuntimeException e) {
            // How could this happen??  See bug 6827136
            Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
            return;
        }

        NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
                mNameDistanceConservative : mNameDistanceApproximate;

        int score;
        float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
        boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
                || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
        float threshold = emailBased
                ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
                : APPROXIMATE_MATCH_THRESHOLD;
        if (distance > threshold) {
            score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
        } else {
            score = 0;
        }

        updatePrimaryScore(rawContactId, contactId, accountId, score);
    }

    public void matchIdentity(long rawContactId, long contactId, long accountId) {
        updateSecondaryScore(rawContactId, contactId, accountId, IDENTITY_MATCH_SCORE);
    }

    public void updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId) {
        updateSecondaryScore(rawContactId, contactId, accountId, PHONE_MATCH_SCORE);
    }

    public void updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId) {
        updateSecondaryScore(rawContactId, contactId, accountId, EMAIL_MATCH_SCORE);
    }

    public void updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId) {
        updateSecondaryScore(rawContactId, contactId, accountId, NICKNAME_MATCH_SCORE);
    }

    private void updatePrimaryScore(long rawContactId, long contactId, long accountId, int score) {
        getMatchingScore(rawContactId, contactId, accountId).updatePrimaryScore(score);
    }

    private void updateSecondaryScore(long rawContactId, long contactId, long accountId,
            int score) {
        getMatchingScore(rawContactId, contactId, accountId).updateSecondaryScore(score);
    }

    public void keepIn(long rawContactId, long contactId, long accountId) {
        getMatchingScore(rawContactId, contactId, accountId).keepIn();
    }

    public void keepOut(long rawContactId, long contactId, long accountId) {
        getMatchingScore(rawContactId, contactId, accountId).keepOut();
    }

    public void clear() {
        mScores.clear();
        mScoreCount = 0;
    }
    /**
     * Returns a list of IDs for raw contacts that are only matched on secondary data elements
     * (phone number, email address, nickname, identity). We need to check if they are missing
     * structured name or not to decide if they should be aggregated.
     * <p>
     * May return null.
     */
    public List<Long> prepareSecondaryMatchCandidates() {
        ArrayList<Long> rawContactIds = null;

        for (int i = 0; i < mScoreCount; i++) {
            MatchScore score = mScoreList.get(i);
            if (score.isKeepOut() ||  score.getPrimaryScore() > SCORE_THRESHOLD_PRIMARY){
                continue;
            }

            if (score.getSecondaryScore() >= SCORE_THRESHOLD_PRIMARY) {
                if (rawContactIds == null) {
                    rawContactIds = new ArrayList<>();
                }
                rawContactIds.add(score.getRawContactId());
            }
            score.setPrimaryScore(0);
        }
        return rawContactIds;
    }

    /**
     * Returns the list of raw contact Ids with the match score over threshold.
     */
    public List<MatchScore> pickBestMatches() {
        final List<MatchScore> matches = new ArrayList<>();
        for (int i = 0; i < mScoreCount; i++) {
            MatchScore score = mScoreList.get(i);
            if (score.isKeepOut()) {
                continue;
            }

            if (score.isKeepIn()) {
                matches.add(score);
                continue;
            }

            if (score.getPrimaryScore() >= SCORE_THRESHOLD_PRIMARY ||
                    (score.getPrimaryScore() == SCORE_THRESHOLD_NO_NAME &&
                            score.getSecondaryScore() > SCORE_THRESHOLD_SECONDARY)) {
                matches.add(score);
            }
        }
        return matches;
    }

    /**
     * Returns matches in the order of descending score.
     */
    public List<MatchScore> pickBestMatches(int threshold) {
        int scaledThreshold = threshold * SCORE_SCALE;
        List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
        Collections.sort(matches);
        int count = 0;
        for (int i = 0; i < mScoreCount; i++) {
            MatchScore matchScore = matches.get(i);
            if (matchScore.getScore() >= scaledThreshold) {
                count++;
            } else {
                break;
            }
        }

        return matches.subList(0, count);
    }

    @Override
    public String toString() {
        return mScoreList.subList(0, mScoreCount).toString();
    }

    public void matchNoName(Long rawContactId, Long contactId, Long accountId) {
        updatePrimaryScore(rawContactId, contactId, accountId, SCORE_THRESHOLD_NO_NAME);
    }
}