// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#include <xnnpack/allocator.h>  // xnn_allocate_zero_memory, xnn_release_memory
#include <xnnpack/memory-planner.h>
#include <xnnpack/subgraph.h>
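
// Typical driving sequence for the tracker implemented here (a sketch; the actual call sites live in the
// subgraph runtime, outside this file):
//   xnn_init_value_allocation_tracker(&tracker, subgraph);
//   xnn_add_value_allocation_tracker(&tracker, value_id, tensor_size);  // per internal value, increasing value_id
//   xnn_plan_value_allocation_tracker(&tracker);
// Afterwards, tracker.mem_arena_size is the required arena size and tracker.usage[value_id].alloc_offset is each
// value's offset into that arena.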

// Check whether the lifecycles of two xnn_values overlap.
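// For example, usages spanning nodes [1, 3] and [3, 5] overlap (both are live during node 3),
// while [1, 2] and [3, 5] do not.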
inline static bool value_lifecycle_overlap(const struct xnn_value_usage* a, const struct xnn_value_usage* b) {
  assert(a->last_node >= a->first_node);
  assert(b->last_node >= b->first_node);
  if (a->first_node < b->first_node) {
    return a->last_node >= b->first_node;
  } else {
    return b->last_node >= a->first_node;
  }
}

// Comparison function for sorting xnn_value_usage pointers by tensor_size in decreasing order.
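// The planner processes values from largest to smallest (a common greedy heuristic): large tensors claim
// space first, and smaller ones can then fill the gaps left between them.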
static inline int cmp_value_usage_tensor_size(const void* a, const void* b) {
  const size_t tensor_size_a = (*(struct xnn_value_usage**)a)->tensor_size;
  const size_t tensor_size_b = (*(struct xnn_value_usage**)b)->tensor_size;
  return (tensor_size_b > tensor_size_a) - (tensor_size_b < tensor_size_a);
}

static void populate_value_lifecycle(const xnn_subgraph_t subgraph, struct xnn_value_usage* usage) {
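  // Record, for every value, the first and last node that uses it as an input or output.
  // For example, a value produced by node 2 and consumed by nodes 3 and 5 ends up with
  // first_node = 2 and last_node = 5.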
  assert(subgraph != NULL);
  if (subgraph->num_nodes == 0) {
    return;
  }
  // first_node/last_node in each xnn_value_usage are zero-initialized in xnn_init_value_allocation_tracker, so we
  // start with the second node (nid = 1): a first_node of 0 then means "not set yet". The values touched by the
  // first node (nid = 0) are fixed up after the loop.
  for (uint32_t nid = 1; nid < subgraph->num_nodes; ++nid) {
    const struct xnn_node* node = subgraph->nodes + nid;
    for (uint32_t i = 0; i < node->num_inputs; ++i) {
      if (usage[node->inputs[i]].first_node == 0) {
        usage[node->inputs[i]].first_node = nid;
      }
      usage[node->inputs[i]].last_node = nid;
    }
    for (uint32_t i = 0; i < node->num_outputs; ++i) {
      if (usage[node->outputs[i]].first_node == 0) {
        usage[node->outputs[i]].first_node = nid;
      }
      usage[node->outputs[i]].last_node = nid;
    }
  }
  const struct xnn_node* first_node = subgraph->nodes;
  for (uint32_t i = 0; i < first_node->num_inputs; ++i) {
    usage[first_node->inputs[i]].first_node = 0;
  }
  for (uint32_t i = 0; i < first_node->num_outputs; ++i) {
    usage[first_node->outputs[i]].first_node = 0;
  }
}

// Represent a memory block [start, end)
struct memory_block {
  size_t start;
  size_t end;
};

// Comparison function for sorting memory_blocks by 'start' in increasing order.
static inline int cmp_memory_block(const void* a, const void* b) {
  const size_t start_a = ((struct memory_block*)a)->start;
  const size_t start_b = ((struct memory_block*)b)->start;
  return (start_a > start_b) - (start_a < start_b);
}

// Given the current live memory blocks, return the offset in a memory arena for a to-be-allocated value of size
// 'to_alloc_size'.
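// For example (illustrative sizes): with live blocks [0, 32) and [48, 128), a 16-byte value fits exactly into the
// gap and is placed at offset 32; a 24-byte value does not fit into any gap and is placed at offset 128 instead.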
static size_t find_value_alloc_offset(struct memory_block* live_mem_blocks,
                                      size_t num_mem_blocks,
                                      size_t to_alloc_size) {
  if (num_mem_blocks == 0) {
    return 0;
  }

  if (num_mem_blocks == 1) {
    return live_mem_blocks[0].end;
  }

  // Sort memory blocks according to 'start' in increasing order.
  qsort(live_mem_blocks, num_mem_blocks, sizeof(struct memory_block), cmp_memory_block);

  // Coalesce overlapping or immediately adjacent memory blocks into a list of non-overlapping blocks, so that the
  // gaps between them can be searched for the smallest one that fits.
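  // For example, [0, 32) and [16, 48) coalesce into [0, 48), and [0, 32) and [32, 64) into [0, 64).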
  size_t num_coalesced_mem_blocks = 1;
  for (size_t i = 1; i < num_mem_blocks; ++i) {
    const size_t current_coalesced_end =
        live_mem_blocks[num_coalesced_mem_blocks - 1].end;
    if (live_mem_blocks[i].start > current_coalesced_end) {
      assert(num_coalesced_mem_blocks <= i);
      live_mem_blocks[num_coalesced_mem_blocks] = live_mem_blocks[i];
      num_coalesced_mem_blocks++;
      continue;
    }
    if (live_mem_blocks[i].end > current_coalesced_end) {
      live_mem_blocks[num_coalesced_mem_blocks - 1].end = live_mem_blocks[i].end;
    }
  }

  size_t smallest_gap_size = SIZE_MAX;
  // Index of the coalesced block after whose end the value will be placed. Defaults to the last block, i.e. the
  // allocation is placed past all currently live blocks.
  size_t smallest_gap_index = num_coalesced_mem_blocks - 1;
  for (size_t i = 0; i < num_coalesced_mem_blocks - 1; ++i) {
    assert(live_mem_blocks[i + 1].start > live_mem_blocks[i].end);
    const size_t gap = live_mem_blocks[i + 1].start - live_mem_blocks[i].end;
    if (gap >= to_alloc_size && gap < smallest_gap_size) {
      smallest_gap_index = i;
      smallest_gap_size = gap;
    }
  }
  return live_mem_blocks[smallest_gap_index].end;
}

void xnn_init_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker, const xnn_subgraph_t subgraph) {
  tracker->subgraph = subgraph;
  tracker->mem_arena_size = 0;
  tracker->usage = xnn_allocate_zero_memory(sizeof(struct xnn_value_usage) * subgraph->num_values);
#if XNN_ENABLE_MEMOPT
  populate_value_lifecycle(tracker->subgraph, tracker->usage);
#endif
  tracker->min_value_id = XNN_INVALID_VALUE_ID;
  tracker->max_value_id = XNN_INVALID_VALUE_ID;
}

void xnn_add_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker,
                                      uint32_t value_id,
                                      size_t tensor_size) {
  tracker->usage[value_id].tensor_size = tensor_size;
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    tracker->min_value_id = value_id;
  } else {
    // Note that values are expected to be added in increasing order.
    assert(value_id > tracker->min_value_id);
    assert(value_id > tracker->max_value_id);
  }

  tracker->max_value_id = value_id;
}

void xnn_plan_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker) {
#if XNN_ENABLE_MEMOPT
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    assert(tracker->max_value_id == XNN_INVALID_VALUE_ID);
    return;
  }

  const uint32_t num_values = tracker->max_value_id - tracker->min_value_id + 1;
  struct xnn_value_usage** sorted_usage = xnn_allocate_zero_memory(sizeof(struct xnn_value_usage*) * num_values);
  size_t num_values_to_alloc = 0;
  for (size_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) {
    struct xnn_value_usage* info = tracker->usage + i;
    if (info->tensor_size != 0) {
      sorted_usage[num_values_to_alloc++] = info;
    }
  }
  qsort(sorted_usage, num_values_to_alloc, sizeof(struct xnn_value_usage*), cmp_value_usage_tensor_size);

  // Start the allocation planning process.
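  // For each value, largest first: gather the memory blocks of already-placed values whose lifecycles overlap
  // with it (only those constrain its placement), then place it at the offset returned by
  // find_value_alloc_offset (the smallest fitting gap between those blocks, or just past the last of them).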
  struct memory_block* current_live_mem_blocks = xnn_allocate_zero_memory(
      sizeof(struct memory_block) * num_values_to_alloc);
  size_t mem_arena_size = 0;
  for (size_t i = 0; i < num_values_to_alloc; ++i) {
    size_t num_live_mem_blocks = 0;
    struct xnn_value_usage* current = sorted_usage[i];
    for (size_t j = 0; j < i; ++j) {
      const struct xnn_value_usage* allocated = sorted_usage[j];
      if (value_lifecycle_overlap(current, allocated)) {
        current_live_mem_blocks[num_live_mem_blocks++] = (struct memory_block){
            .start = allocated->alloc_offset,
            .end = allocated->alloc_offset + allocated->tensor_size,
        };
      }
    }
    current->alloc_offset = find_value_alloc_offset(current_live_mem_blocks, num_live_mem_blocks, current->tensor_size);
    if (mem_arena_size < current->alloc_offset + current->tensor_size) {
      mem_arena_size = current->alloc_offset + current->tensor_size;
    }
  }

  tracker->mem_arena_size = mem_arena_size;
  xnn_release_memory(sorted_usage);
  xnn_release_memory(current_live_mem_blocks);
#else
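  // Without XNN_ENABLE_MEMOPT, value lifecycles are never populated, so values cannot share memory: simply lay
  // them out end-to-end. This is safe but uses more memory than the gap-filling plan above.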
  tracker->mem_arena_size = 0;
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    return;  // no values were tracked; mirrors the early return in the XNN_ENABLE_MEMOPT branch above
  }
  for (uint32_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) {
    if (tracker->usage[i].tensor_size > 0) {
      tracker->usage[i].alloc_offset = tracker->mem_arena_size;
      tracker->mem_arena_size += tracker->usage[i].tensor_size;
    }
  }
#endif
}