summaryrefslogtreecommitdiff
path: root/libchrome_tools/uprev/reconnect_history.py
blob: bb6040c694a528f94d8a236b01dcadb9a1ebc755 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#!/usr/bin/env python3
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Utility to disconnect history of files from a branch, and reconnect with base on
a different branch.
"""

import argparse
import collections
import subprocess
import sys

import filtered_utils
import lazytree
import utils


class CommitMetadataFactory(dict):
    """Dictionary that lazily loads and memoizes commit metadata.

    Looking up a key that is not stored yet fetches its metadata through
    filtered_utils.get_metadata and keeps the result in the dict, so later
    lookups of the same key are served without another fetch.
    """

    def __missing__(self, key):
        """Fetches, stores and returns the metadata for an unseen key.

        Args:
            key: commit identifier passed as-is to filtered_utils.get_metadata.
        """
        metadata = self[key] = filtered_utils.get_metadata(key)
        return metadata


def disconnect(source_commit, ref_commit):
    """Builds a commit on top of source_commit without ref_commit's files.

    Every path listed for ref_commit is dropped from the file list of
    source_commit; the remaining entries are written to a new tree and
    committed with source_commit as the only parent.

    Returns the result of utils.git_commit for the new commit.

    Args:
        source_commit: commit hash (str) whose files are being disconnected.
        ref_commit: commit hash whose file list names the paths to drop.
    """
    source_entries = utils.get_file_list(source_commit)
    dropped_paths = {entry.path for entry in utils.get_file_list(ref_commit)}

    remaining_entries = []
    for entry in source_entries:
        if entry.path in dropped_paths:
            continue
        remaining_entries.append(entry)

    commit_message = (
        b'Disconnect history from %s' % source_commit.encode('ascii'))
    new_tree = utils.git_mktree(remaining_entries)
    return utils.git_commit(new_tree, [source_commit], message=commit_message)


def connect_base(current_commit, base_commit):
    """Builds a two-parent commit holding the files of both commits.

    The new tree is made from the file list of current_commit followed by the
    file list of base_commit, and the commit lists current_commit and
    base_commit as its parents (in that order), much like running
    `git merge base_commit` while on current_commit.

    Returns the result of utils.git_commit for the new commit.

    Args:
        current_commit: commit hash to commit on top of.
        base_commit: commit hash (str) that carries the file histories.
    """
    combined_entries = (utils.get_file_list(current_commit) +
                        utils.get_file_list(base_commit))
    merged_tree = utils.git_mktree(combined_entries)
    commit_message = (
        b'Connect history with base %s' % base_commit.encode('ascii'))
    parents = [current_commit, base_commit]
    return utils.git_commit(merged_tree, parents, message=commit_message)


def blame_files(commithash, files):
    """Blames every path in files at the given commithash.

    Returns a dict mapping each path to the result of utils.git_blame for
    that path on commithash.

    Args:
        commithash: commit hash to run the blame on.
        files: iterable of file paths to blame.
    """
    return {path: utils.git_blame(commithash, path) for path in files}


def search_blame_line(blames, amend_commits, target_commit_hash):
    """Pairs lines blamed on target_commit_hash with amend_commits blames.

    Each file in blames is blamed again on every commit in amend_commits and
    the results are zipped line by line with the original blame. Only rows
    whose first element is attributed to target_commit_hash are kept.

    Returns a map from file path to a list of tuples; each tuple has
    len(amend_commits) + 1 elements.  Element 0 is the line from blames, and
    elements 1..n are the corresponding lines from the amend_commits blames.

    Args:
        blames: a dict from path to list of GitBlameLine, for files blamed on
            target_commit_hash.
        amend_commits: a list of commit hashes to provide actual history.
        target_commit_hash: commit hash that blames are blamed on.
    """
    rows_by_path = {}
    for path, goal_lines in blames.items():
        amend_blames = []
        for amend_commit in amend_commits:
            amend_blames.append(utils.git_blame(amend_commit, path))

        matching_rows = []
        for row in zip(goal_lines, *amend_blames):
            if row[0].commit == target_commit_hash:
                matching_rows.append(row)
        rows_by_path[path] = matching_rows
    return rows_by_path


def get_track_from_blames(blames_combined, virtual_goal_commit, amend_commits,
                          commit_choice_cache, commit_msg_cache):
    """Decides, line by line, which amend commit each diffed line belongs to.

    For every row the candidate commits are the commits of elements 1..n. If
    that exact combination of candidates was resolved before, the cached
    answer is reused; otherwise the candidates' titles are printed and the
    user is asked to pick one by index. The answer is stored in
    commit_choice_cache.

    Returns a tuple containing:
     - the set of chosen commit hashes;
     - a map from file path to a list of (goal line, chosen commit hash)
       pairs, one pair per row of blames_combined.

    Args:
        blames_combined: a map from path to a list of tuples. Each tuple
            reflects one line and has len(amend_commits)+1 elements. See
            search_blame_line for details.
        virtual_goal_commit: a commit that contains no useful history for
            diffs. Not read by this function.
        amend_commits: list of HEAD commit hashes that refer to trees that
            can amend the diffs. Not read by this function.
        commit_choice_cache: dict caching the user's choice, keyed by the
            tuple of candidate commit hashes. Updated in place.
        commit_msg_cache: mapping from commit hash to commit metadata; only
            the `title` attribute is used, for the prompt.
    """
    tracked_commits = set()
    resolved_by_path = {}

    for path, rows in blames_combined.items():
        resolved = resolved_by_path[path] = []
        for row in rows:
            candidates = tuple(amend_line.commit for amend_line in row[1:])
            picked = commit_choice_cache.get(candidates)
            if picked is None:
                for number, candidate in enumerate(candidates):
                    title = commit_msg_cache[candidate].title
                    print('%d: %s' % (number, title))
                # The answer is deliberately not validated: the tool is run by
                # a trusted developer who can simply rerun it after a typo.
                picked = candidates[int(input('Choose patch: '))]
                commit_choice_cache[candidates] = picked
            tracked_commits.add(picked)
            resolved.append((row[0], picked))

    return tracked_commits, resolved_by_path


def reconstruct_file(blame_goal, blame_base, lines_to_reconstruct,
                     virtual_goal_commit):
    """Reconstructs a file so that it reflects only lines_to_reconstruct.

    Walks blame_base and blame_goal side by side. Lines that are identical on
    both sides are copied; goal lines listed in lines_to_reconstruct are
    taken from the goal; other goal lines attributed to virtual_goal_commit
    are ignored. Base lines that directly follow an inserted goal line are
    treated as replaced and dropped; all other unmatched base lines are kept.

    Returns the new file content as bytes, each line followed by a newline
    byte.

    Args:
        blame_goal: a list of utils.GitBlameLine blaming the file on
            virtual_goal_commit.
        blame_base: a list of utils.GitBlameLine blaming the file on the last
            committed commit.
        lines_to_reconstruct: only these lines are reconstructed, instead of
            everything in blame_goal. It is a list of GitBlameLine, in the
            order they appear in blame_goal.
        virtual_goal_commit: commit hash that blame_goal is based on.

    Raises:
        ValueError: if a goal line is left over that neither matches the
            base, nor is a line to reconstruct, nor is attributed to
            virtual_goal_commit, while the base is already exhausted.
    """
    idx_base, idx_goal = 0, 0
    len_base, len_goal = len(blame_base), len(blame_goal)
    reconstructed_file = []
    # Must start out False: the base-only branch below may run before any
    # branch that assigns it, and base lines are preserved by default.
    should_skip_base = False

    print('Changed lines are', [line.data for line in lines_to_reconstruct])
    line_iter = iter(lines_to_reconstruct)
    line = next(line_iter, None)
    while idx_base < len_base or idx_goal < len_goal:
        # Either side may run out before the other; None marks "exhausted" so
        # no branch indexes past the end of a list.
        base_line = blame_base[idx_base] if idx_base < len_base else None
        goal_line = blame_goal[idx_goal] if idx_goal < len_goal else None

        # blame_base and the tracked line cannot be compared directly: they
        # are blamed on different commits and may carry different line
        # numbers, so only data and commit are compared here.
        if (base_line is not None and goal_line is not None and
                base_line.data == goal_line.data and
                base_line.commit == goal_line.commit):
            # Both sides are identical: keep the line.
            reconstructed_file.append(base_line.data)
            idx_base += 1
            idx_goal += 1
            should_skip_base = False
        elif (goal_line is not None and line is not None and
              goal_line == line):
            # The goal line is one we are interested in: take it from goal.
            reconstructed_file.append(line.data)
            line = next(line_iter, None)
            idx_goal += 1
            should_skip_base = True
        elif goal_line is not None and goal_line.commit == virtual_goal_commit:
            # The goal line changed, but not in the commit we are interested
            # in: skip it so changes from other commits are not reflected.
            idx_goal += 1
        elif base_line is not None:
            # A base line with no counterpart in goal. If goal lines were just
            # appended, this base line is what they replaced, so drop it.
            # Otherwise the modification belongs to a commit we are not
            # interested in right now, so the base line is preserved.
            if not should_skip_base:
                reconstructed_file.append(base_line.data)
            idx_base += 1
        else:
            # Base is exhausted and the goal line is attributed to older
            # history, yet was never matched: the two blames disagree.
            raise ValueError(
                'goal line %r has no counterpart in blame_base' % (goal_line,))

    return b''.join(data + b'\n' for data in reconstructed_file)


def reconstruct_files(track_commit, blame_untracked_lines, blames,
                      current_base_commit, virtual_goal_commit):
    """Rebuilds every file that has lines assigned to track_commit.

    Files without any line assigned to track_commit are left out of the
    result entirely.

    Returns a map from file path to the reconstructed file content.

    Args:
        track_commit: commit hash to track, and reconstruct from.
        blame_untracked_lines: map from file path to a list of
            (goal line, chosen amend commit) pairs; see get_track_from_blames.
        blames: a map from filename to list of utils.GitBlameLine.
        current_base_commit: commit hash for HEAD of base that contains base
            history + already committed amend history.
        virtual_goal_commit: commit hash for one giant commit that has no
            history.  virtual_goal_commit is one commit ahead of
            current_base_commit.
    """
    # Phase 1: pick, per file, the goal lines that were assigned to
    # track_commit. Files with no such line never get an entry.
    tracked_lines = {}
    for path, entries in blame_untracked_lines.items():
        selected = [entry[0] for entry in entries if entry[1] == track_commit]
        if selected:
            tracked_lines[path] = selected

    # Phase 2: rebuild each of those files against the current base.
    rebuilt = {}
    for path, selected in tracked_lines.items():
        print('Reconstructing', path, 'for', track_commit)
        base_blame = utils.git_blame(current_base_commit, path)
        rebuilt[path] = reconstruct_file(blames[path], base_blame, selected,
                                         virtual_goal_commit)
    return rebuilt


def main():
    """Command-line entry point: rebuilds history commit by commit.

    Takes three kinds of positional arguments: disconnect_from, base_commit
    and one or more amend_commits. It first creates a commit that removes
    base_commit's files from disconnect_from, then a commit that joins that
    result with base_commit. After that it loops, creating one reconstructed
    commit per iteration, until the tree of disconnect_from no longer differs
    from the reconstructed history or nothing is left to track. The final
    commit, which carries the exact tree of disconnect_from, is printed.
    """
    # Init args
    parser = argparse.ArgumentParser(description='Reconnect git history')
    parser.add_argument(
        'disconnect_from',
        metavar='disconnect_from',
        type=str,
        nargs=1,
        help='disconnect history from this commit')
    parser.add_argument(
        'base_commit',
        metavar='base_commit',
        type=str,
        nargs=1,
        help='base commit to use the history')
    parser.add_argument(
        'amend_commits',
        metavar='amend_commits',
        type=str,
        nargs='+',
        help='commits to amend histories from base_commit')

    # nargs=1 makes argparse store a one-element list, hence the [0] below.
    arg = parser.parse_args(sys.argv[1:])
    empty_commit = disconnect(arg.disconnect_from[0], arg.base_commit[0])
    connected_base = connect_base(empty_commit, arg.base_commit[0])

    commit_msg_cache = CommitMetadataFactory()
    commit_choice_cache = {}
    last_commit = connected_base
    # In each iteration of the loop, it
    #  - re-creates the new goal commit (base + committed history + one giant
    #  uncommitted history).
    #  - blames on the new goal commit and on the tips of the amend commits,
    #  mapping line-by-line from uncommitted lines to past histories.
    #  - chooses one of the past commits, reconstructs files to reflect the
    #  changes in that commit, and creates a new commit.
    # last_commit, commit_msg_cache, commit_choice_cache are persistent
    # across iterations.
    while True:
        # One commit is processed per iteration.

        # Create virtual target commit, and its diff. The commit uses the tree
        # of disconnect_from and has last_commit as its only parent.
        virtual_goal = utils.git_commit(arg.disconnect_from[0] + '^{tree}',
                                        [last_commit])
        # NOTE(review): presumably a None first argument diffs virtual_goal
        # against its parent -- confirm in utils.git_difftree.
        diffs = utils.git_difftree(None, virtual_goal)
        if not diffs:
            # NOTE(review): .decode() implies last_commit is bytes here --
            # confirm against the return type of utils.git_commit.
            print('No diffs are found between %s and goal.' %
                  (last_commit.decode('ascii'),))
            break

        blames = blame_files(virtual_goal,
                             [diff.file.path for diff in diffs])
        blames_combined = search_blame_line(blames, arg.amend_commits,
                                            virtual_goal)

        commits_to_track, blame_untracked_lines = get_track_from_blames(
            blames_combined, virtual_goal, arg.amend_commits,
            commit_choice_cache, commit_msg_cache)
        if not commits_to_track:
            print('no commits to track, stopping')
            break

        # Stably choose one commit from commits_to_track (the smallest hash,
        # so the pick does not depend on set ordering), and reconstruct it.
        track_commit = min(commits_to_track)
        print('Reconstructing commit %s: %s' %
              (track_commit, commit_msg_cache[track_commit].title))
        constructed_files = reconstruct_files(track_commit,
                                              blame_untracked_lines, blames,
                                              last_commit, virtual_goal)

        # Mktree and commit with re-constructed_files.
        tree = lazytree.LazyTree(filtered_utils.get_metadata(last_commit).tree)
        for filename, filedata in constructed_files.items():
            # `git hash-object -w` stores the content fed on stdin as a blob
            # and prints its hash; the file keeps the mode it had in the tree.
            blob = subprocess.check_output(
                ['git', 'hash-object', '-w', '/dev/stdin'],
                input=filedata).strip()
            tree[filename] = utils.GitFile(filename, tree[filename].mode, blob)
        # Reuse the tracked commit's message and authorship for the new commit.
        meta = commit_msg_cache[track_commit]
        last_commit = utils.git_commit(
            tree.hash(), [last_commit],
            (meta.message + b'\n(Reconstructed from ' + track_commit + b')\n'),
            dict(
                GIT_AUTHOR_NAME=meta.authorship.name,
                GIT_AUTHOR_EMAIL=meta.authorship.email,
                GIT_AUTHOR_DATE=b' '.join(
                    [meta.authorship.time, meta.authorship.timezone])))
        print('Reconstructed as', last_commit)
    # Make last commit for history reconstruction: it carries the exact tree
    # of disconnect_from on top of the reconstructed history.
    print(
        utils.git_commit(
            filtered_utils.get_metadata(arg.disconnect_from[0]).tree,
            [last_commit],
            b'Finished history reconstruction\n\nRemoving unnecessary lines\n'))


# Run the tool only when executed as a script, not when imported.
if __name__ == '__main__':
    main()