// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#ifndef __MACH__
#define _POSIX_C_SOURCE 199309L
#endif

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>  // For snprintf.
#include <stdlib.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/cache.h>
#include <xnnpack/common.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/memory-planner.h>
#include <xnnpack/operator.h>
#include <xnnpack/params.h>
#include <xnnpack/subgraph.h>

#if defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#elif XNN_PLATFORM_WINDOWS
#include <windows.h>
#else
#include <errno.h>
#include <time.h>
#endif

#ifndef XNN_ENABLE_JIT
  #error "XNN_ENABLE_JIT is not defined"
#endif

enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out)
{
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create workspace: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  struct xnn_workspace* workspace = NULL;
  workspace = xnn_allocate_zero_memory(sizeof(struct xnn_workspace));
  if (workspace == NULL) {
    xnn_log_error("failed to allocate %zu bytes for workspace descriptor", sizeof(struct xnn_workspace));
    return xnn_status_out_of_memory;
  }

  workspace->ref_count = 1;
  *workspace_out = workspace;
  return xnn_status_success;
}

static inline void xnn_retain_workspace(xnn_workspace_t workspace)
{
  workspace->ref_count++;
}

enum xnn_status xnn_release_workspace(xnn_workspace_t workspace)
{
  assert(workspace->ref_count != 0);
  if (--workspace->ref_count == 0) {
    xnn_release_simd_memory(workspace->data);
    xnn_release_memory(workspace);
  }
  return xnn_status_success;
}

enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out)
{
  struct xnn_weights_cache* weights_cache = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create weights cache: XNNPACK is not initialized");
    goto error;
  }

  weights_cache = xnn_allocate_zero_memory(sizeof(struct xnn_weights_cache));
  if (weights_cache == NULL) {
    xnn_log_error("failed to allocate %zu bytes for weights cache descriptor", sizeof(struct xnn_weights_cache));
    goto error;
  }

  status = xnn_init_weights_cache_with_size(weights_cache, size);
  if (status != xnn_status_success) {
    goto error;
  }

  *weights_cache_out = weights_cache;
  return xnn_status_success;

error:
  xnn_release_weights_cache(weights_cache);
  return status;
}

enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out)
{
  return xnn_create_weights_cache_with_size(XNN_DEFAULT_WEIGHTS_BUFFER_SIZE, weights_cache_out);
}

enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache)
{
  enum xnn_status status = xnn_release_weights_cache(weights_cache);
  if (status != xnn_status_success) {
    return status;
  }
  xnn_release_memory(weights_cache);
  return xnn_status_success;
}

enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out)
{
  return xnn_create_runtime_v2(subgraph, NULL /* threadpool */, 0 /* flags */, runtime_out);
}

enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  return xnn_create_runtime_v3(subgraph, /* weights_cache */ NULL, threadpool, flags, runtime_out);
}

enum xnn_status xnn_create_runtime_v3(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  xnn_workspace_t workspace;
  enum xnn_status status = xnn_create_workspace(&workspace);
  if (status != xnn_status_success) {
    return status;
  }
  status = xnn_create_runtime_v4(subgraph, weights_cache, workspace, threadpool, flags, runtime_out);
  // Release the workspace regardless of the return status of runtime creation;
  // on success the runtime holds its own reference.
  xnn_release_workspace(workspace);
  return status;
}
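
// Usage sketch (illustrative only, not part of this file's API surface): callers that do
// not manage a workspace or weights cache explicitly can use the v1/v2/v3 wrappers, which
// forward to xnn_create_runtime_v4 with default arguments. Assuming `subgraph` was built
// elsewhere and passing a NULL threadpool for single-threaded execution:
//
//   xnn_runtime_t runtime = NULL;
//   enum xnn_status status = xnn_initialize(/*allocator=*/NULL);
//   if (status == xnn_status_success) {
//     status = xnn_create_runtime_v2(subgraph, /*threadpool=*/NULL, /*flags=*/0, &runtime);
//   }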

static enum xnn_status initialize_workspace_blobs(
  xnn_subgraph_t subgraph,
  xnn_runtime_t runtime,
  struct xnn_value_allocation_tracker* mem_alloc_tracker)
{
  assert(runtime->workspace != NULL);

  size_t mem_arena_size = mem_alloc_tracker->mem_arena_size;
  if (mem_arena_size == 0) {
    return xnn_status_success;
  }
  // Sparse microkernels can read up to 2 * XNN_EXTRA_BYTES beyond array bounds.
  mem_arena_size += 2 * XNN_EXTRA_BYTES;

  // Records how much the workspace has moved due to allocating a larger workspace.
  ptrdiff_t workspace_data_delta = 0;
  // Allocate a larger workspace here if needed.
  if (runtime->workspace->size < mem_arena_size) {
    void* old_workspace_data = runtime->workspace->data;
    if (runtime->workspace->size != 0) {
      // Free up the workspace's current data. Free first, then allocate, to keep peak memory usage low.
      xnn_release_simd_memory(runtime->workspace->data);
    }
    void* new_workspace_data = xnn_allocate_simd_memory(mem_arena_size);
    if (new_workspace_data == NULL) {
      xnn_log_error("failed to allocate %zu bytes for runtime workspace", mem_arena_size);
      return xnn_status_out_of_memory;
    }
    runtime->workspace->data = new_workspace_data;
    runtime->workspace->size = mem_arena_size;
    // Keep track of how much the workspace data moved.
    if (old_workspace_data != NULL) {
      workspace_data_delta = (uintptr_t) new_workspace_data - (uintptr_t) old_workspace_data;
    }
  }
  assert(runtime->workspace->size >= mem_arena_size);

  // Initialize the current runtime's blob pointers.
  for (size_t i = 0; i < subgraph->num_values; i++) {
    const struct xnn_value* value = &subgraph->values[i];
    struct xnn_blob* blob = &runtime->blobs[i];
    if (value->datatype != xnn_datatype_invalid && value->type == xnn_value_type_dense_tensor) {
      if (blob->allocation_type == xnn_allocation_type_workspace) {
        // Value is purely internal to the runtime; allocate it in the workspace.
        blob->data = (void*) ((uintptr_t) runtime->workspace->data + mem_alloc_tracker->usage[i].alloc_offset);
      }
    }
  }

  // Adjust the blob pointers of all runtimes that share this workspace.
  if (workspace_data_delta != 0) {
    for (struct xnn_runtime* rt = runtime->workspace->first_user; rt != NULL; rt = rt->next_workspace_user) {
      // The current runtime already has the correct offset.
      if (rt == runtime) {
        continue;
      }
      for (size_t i = 0; i < rt->num_blobs; i++) {
        struct xnn_blob* blob = &rt->blobs[i];
        if (blob->allocation_type == xnn_allocation_type_workspace) {
          assert(blob->data != NULL);
          blob->data = (void*) ((uintptr_t) blob->data + workspace_data_delta);
        }
      }
    }
  }

  return xnn_status_success;
}
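
// Worked example of the rebase above (illustrative numbers): if a shared workspace grows and
// its base address moves from 0x1000 to 0x8000, workspace_data_delta is 0x7000. A sibling
// runtime whose blob previously pointed at 0x1000 + 0x40 is rebased to 0x8000 + 0x40, so its
// offset within the workspace is preserved while only the base address changes.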

enum xnn_status xnn_create_runtime_v4(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  xnn_workspace_t workspace,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  struct xnn_runtime* runtime = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create runtime: XNNPACK is not initialized");
    goto error;
  }

  if (workspace == NULL) {
    xnn_log_error("failed to create runtime: workspace is NULL");
    status = xnn_status_invalid_parameter;
    goto error;
  }

  const uint32_t optimization_flags = XNN_FLAG_SPARSE_INFERENCE | XNN_FLAG_HINT_FP16_INFERENCE |
    XNN_FLAG_FORCE_FP16_INFERENCE | XNN_FLAG_NO_OPERATOR_FUSION;
  status = xnn_subgraph_optimize(subgraph, flags & optimization_flags);
  if (status != xnn_status_success) {
    xnn_log_error("failed to optimize subgraph");
    goto error;
  }

  status = xnn_status_out_of_memory;

  runtime = xnn_allocate_zero_memory(sizeof(struct xnn_runtime));
  if (runtime == NULL) {
    xnn_log_error("failed to allocate %zu bytes for runtime descriptor", sizeof(struct xnn_runtime));
    goto error;
  }

  runtime->opdata = xnn_allocate_zero_memory(sizeof(struct xnn_operator_data) * subgraph->num_nodes);
  if (runtime->opdata == NULL) {
    xnn_log_error("failed to allocate %zu bytes for opdata descriptors",
      sizeof(struct xnn_operator_data) * (size_t) subgraph->num_nodes);
    goto error;
  }
  runtime->num_ops = subgraph->num_nodes;

  if (flags & XNN_FLAG_YIELD_WORKERS) {
    struct xnn_node* last_valid_node = NULL;
    for (size_t i = 0; i < subgraph->num_nodes; i++) {
      struct xnn_node* node = subgraph->nodes + i;
      if (node->type != xnn_node_type_invalid) {
        last_valid_node = node;
      }
    }
    if (last_valid_node != NULL) {
      last_valid_node->flags |= XNN_FLAG_YIELD_WORKERS;
    }
  }

  struct xnn_code_cache* code_cache = NULL;
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
  code_cache = &runtime->code_cache;
  status = xnn_init_code_cache(code_cache);
  if (status != xnn_status_success) {
    goto error;
  }
#endif
  const struct xnn_caches caches = {
    .code_cache = code_cache,
    .weights_cache = weights_cache,
  };

  struct xnn_value* values = subgraph->values;
  for (size_t i = 0; i < subgraph->num_nodes; i++) {
    const struct xnn_node* node = subgraph->nodes + i;

    // Ignore fused nodes.
    if (node->type != xnn_node_type_invalid) {
      assert(node->create != NULL);
      status = node->create(node, values, subgraph->num_values, runtime->opdata + i, &caches);
      if (status != xnn_status_success) {
        goto error;
      }
      runtime->opdata[i].setup = node->setup;
    }
  }

#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
  xnn_finalize_code_memory(&code_cache->cache.code);
#endif

  runtime->blobs = xnn_allocate_zero_memory(sizeof(struct xnn_blob) * subgraph->num_values);
  if (runtime->blobs == NULL) {
    xnn_log_error("failed to allocate %zu bytes for blob descriptors",
      sizeof(struct xnn_blob) * (size_t) subgraph->num_values);
    goto error;
  }
  runtime->num_blobs = subgraph->num_values;

  struct xnn_value_allocation_tracker mem_alloc_tracker;
  xnn_init_value_allocation_tracker(&mem_alloc_tracker, subgraph);

  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    struct xnn_blob* blob = &runtime->blobs[i];
    if (value->datatype != xnn_datatype_invalid && value->type == xnn_value_type_dense_tensor) {
      blob->size = xnn_tensor_get_size(subgraph, i);
      blob->data = (void*) (uintptr_t) value->data;
      if (blob->data == NULL) {
        if ((value->flags & (XNN_VALUE_FLAG_EXTERNAL_INPUT | XNN_VALUE_FLAG_EXTERNAL_OUTPUT)) == 0) {
          // Value is purely internal to the runtime and must be allocated in its workspace.
          xnn_add_value_allocation_tracker(&mem_alloc_tracker, i, round_up_po2(blob->size, XNN_EXTRA_BYTES));
          blob->allocation_type = xnn_allocation_type_workspace;
        } else {
          // Value is non-static and external to the runtime: it must be provided via a call to xnn_setup_runtime.
          blob->allocation_type = xnn_allocation_type_external;
        }
      } else {
        blob->allocation_type = xnn_allocation_type_static;
      }
    }
  }
  xnn_plan_value_allocation_tracker(&mem_alloc_tracker);

  xnn_retain_workspace(workspace);
  runtime->workspace = workspace;
  runtime->next_workspace_user = runtime->workspace->first_user;
  runtime->workspace->first_user = runtime;

  status = initialize_workspace_blobs(subgraph, runtime, &mem_alloc_tracker);
  if (status != xnn_status_success) {
    xnn_release_value_allocation_tracker(&mem_alloc_tracker);
    goto error;
  }

  if (flags & XNN_FLAG_BASIC_PROFILING) {
    runtime->profiling = true;
  }

  xnn_release_value_allocation_tracker(&mem_alloc_tracker);

  runtime->threadpool = threadpool;

  *runtime_out = runtime;
  return xnn_status_success;

error:
  xnn_delete_runtime(runtime);
  return status;
}
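
// Usage sketch (illustrative only): two runtimes can share one workspace so that their
// internal tensors are planned into the same arena. The workspace is reference-counted:
// xnn_create_runtime_v4 retains it and xnn_delete_runtime releases it, so the caller may
// drop its own reference immediately after creating the runtimes.
//
//   xnn_workspace_t workspace = NULL;
//   xnn_create_workspace(&workspace);
//   xnn_create_runtime_v4(subgraph_a, /*weights_cache=*/NULL, workspace, threadpool, /*flags=*/0, &runtime_a);
//   xnn_create_runtime_v4(subgraph_b, /*weights_cache=*/NULL, workspace, threadpool, /*flags=*/0, &runtime_b);
//   xnn_release_workspace(workspace);  // The runtimes keep the workspace alive.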

enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values)
{
  // Validate inputs without changing internal state.
  // This ensures that the runtime stays in a consistent state in case validation fails midway.
  for (size_t i = 0; i < num_external_values; i++) {
    const struct xnn_external_value* external_value = &external_values[i];
    const uint32_t value_id = external_value->id;
    if (value_id >= runtime->num_blobs) {
      xnn_log_error("failed to setup runtime: out-of-bounds ID %" PRIu32 " in external value #%zu",
        value_id, i);
      return xnn_status_invalid_parameter;
    }

    const struct xnn_blob* blob = &runtime->blobs[value_id];
    if (blob->allocation_type != xnn_allocation_type_external) {
      xnn_log_error("failed to setup runtime: Value %" PRIu32 " is not external", value_id);
      return xnn_status_invalid_parameter;
    }
  }

  // Apply runtime state changes.
  for (size_t i = 0; i < num_external_values; i++) {
    const struct xnn_external_value* external_value = &external_values[i];
    const uint32_t value_id = external_value->id;
    struct xnn_blob* blob = &runtime->blobs[value_id];
    blob->data = external_value->data;
  }

  for (size_t i = 0; i < runtime->num_ops; i++) {
    const struct xnn_operator_data* opdata = &runtime->opdata[i];
    if (opdata->operator_objects[0] == NULL) {
      // Operator was removed during optimization.
      continue;
    }

    // Ensure that the weights cache is finalized.
    struct xnn_weights_cache* weights_cache = opdata->operator_objects[0]->weights_cache;
    if (weights_cache != NULL && !xnn_weights_cache_is_finalized(weights_cache)) {
      xnn_log_error("weights cache needs to be finalized before setup/infer");
      return xnn_status_invalid_state;
    }

    assert(opdata->setup != NULL);
    const enum xnn_status status = opdata->setup(opdata, runtime->blobs, runtime->num_blobs, runtime->threadpool);
    if (status != xnn_status_success) {
      xnn_log_error("failed to setup runtime: error in operator #%zu", i);
      return status;
    }
  }

  return xnn_status_success;
}
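
// Usage sketch (illustrative only): external values are bound by ID before invocation. The
// IDs below are assumed to come from xnn_define_tensor_value calls made while the subgraph
// was built; the buffers are caller-owned.
//
//   const struct xnn_external_value externals[] = {
//     {.id = input_id,  .data = input_buffer},
//     {.id = output_id, .data = output_buffer},
//   };
//   status = xnn_setup_runtime(runtime, 2, externals);
//   if (status == xnn_status_success) {
//     status = xnn_invoke_runtime(runtime);
//   }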

static xnn_timestamp xnn_read_timer()
{
  xnn_timestamp timestamp;
#ifdef __MACH__
  timestamp = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
  if (timestamp == 0) {
    xnn_log_warning("clock_gettime failed: error code %d", errno);
  }
#elif __EMSCRIPTEN__
  timestamp = emscripten_get_now();
#elif XNN_PLATFORM_WINDOWS
  BOOL res = QueryPerformanceCounter(&timestamp);
  if (!res) {
    xnn_log_error("QueryPerformanceCounter failed: error code %u", GetLastError());
    memset(&timestamp, 0, sizeof(timestamp));
  }
#else
  int res = clock_gettime(CLOCK_MONOTONIC, &timestamp);
  if (res != 0) {
    xnn_log_error("clock_gettime failed: error code %d", errno);
    memset(&timestamp, 0, sizeof(timestamp));
  }
#endif
  return timestamp;
}

static inline uint64_t xnn_get_elapsed_time(const xnn_timestamp* start, const xnn_timestamp* end)
{
#ifdef __MACH__
  const uint64_t kMicrosInNanos = 1000;
  return (*end - *start) / kMicrosInNanos;
#elif __EMSCRIPTEN__
  const double kMillisInMicros = 1.0e3;
  return (uint64_t) ((*end - *start) * kMillisInMicros);
#elif XNN_PLATFORM_WINDOWS
  const uint64_t kMicrosInSec = 1000 * 1000;
  LARGE_INTEGER frequency;
  BOOL res = QueryPerformanceFrequency(&frequency);
  if (!res) {
    xnn_log_error("QueryPerformanceFrequency failed: error code %u", GetLastError());
    return 0;
  }
  return ((end->QuadPart - start->QuadPart) * kMicrosInSec) / frequency.QuadPart;
#else
  const uint64_t kNanosInMicro = UINT64_C(1000);
  const uint64_t kNanosInSec = UINT64_C(1000000000);
  const uint64_t secs = (end->tv_sec - start->tv_sec) * kNanosInSec;
  const uint64_t ns_secs = (end->tv_nsec - start->tv_nsec);
  return (secs + ns_secs) / kNanosInMicro;
#endif
}

enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
                                               enum xnn_profile_info param_name,
                                               size_t param_value_size,
                                               void* param_value,
                                               size_t* param_value_size_ret)
{
  if (!runtime->profiling) {
    return xnn_status_invalid_state;
  }
  enum xnn_status status = xnn_status_success;
  size_t required_size = 0;
  const struct xnn_operator_data* opdata = runtime->opdata;
  switch (param_name) {
    case xnn_profile_info_num_operators:
      required_size = sizeof(size_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        size_t num_valid_ops = 0;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            num_valid_ops += 1;
          }
        }
        memcpy(param_value, &num_valid_ops, required_size);
      }
      break;
    case xnn_profile_info_operator_name:
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          const char* op_name = xnn_operator_type_to_string(opdata[i].operator_objects[0]->type);
          size_t op_name_len = strlen(op_name) + 1;
          if (opdata[i].operator_objects[0]->ukernel.type != xnn_ukernel_type_default) {
            op_name_len += strlen(xnn_ukernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type)) + 1;
          }
          required_size += op_name_len;
        }
      }
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        char* name_out = (char*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            const char* op_name = xnn_operator_type_to_string(opdata[i].operator_objects[0]->type);
            size_t op_name_len = strlen(op_name) + 1;
            if (opdata[i].operator_objects[0]->ukernel.type != xnn_ukernel_type_default) {
              const char* ukernel_type = xnn_ukernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type);
              op_name_len += strlen(ukernel_type) + 1;
              snprintf(name_out, op_name_len, "%s %s", op_name, ukernel_type);
            } else {
              snprintf(name_out, op_name_len, "%s", op_name);
            }
            name_out += op_name_len;
          }
        }
      }
      break;
    case xnn_profile_info_operator_timing:
    {
      size_t num_valid_ops = 0;
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          num_valid_ops += 1;
        }
      }
      required_size = num_valid_ops * sizeof(uint64_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        xnn_timestamp previous_ts = runtime->start_ts;
        uint64_t* data = (uint64_t*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            uint64_t op_time = 0;
            for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
              if (opdata[i].operator_objects[j] != NULL) {
                op_time += xnn_get_elapsed_time(&previous_ts, &opdata[i].end_ts[j]);
                previous_ts = opdata[i].end_ts[j];
              }
            }
            *data++ = op_time;
          }
        }
      }
      break;
    }
    default:
      status = xnn_status_invalid_parameter;
  }
  return status;
}
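
// Usage sketch (illustrative only): profiling queries follow a two-call pattern. A first call
// with a too-small buffer returns xnn_status_out_of_memory and reports the required size, which
// the caller then uses to size the real query. Meaningful timings require a runtime created with
// XNN_FLAG_BASIC_PROFILING and at least one completed xnn_invoke_runtime call.
//
//   size_t required = 0;
//   if (xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_timing,
//                                      0, NULL, &required) == xnn_status_out_of_memory) {
//     uint64_t* timings_us = malloc(required);
//     xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_timing,
//                                    required, timings_us, &required);
//     // timings_us now holds one elapsed time per live operator, in microseconds.
//     free(timings_us);
//   }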

enum xnn_status xnn_invoke_runtime(
  xnn_runtime_t runtime)
{
  if (runtime->profiling) {
    runtime->start_ts = xnn_read_timer();
  }
  for (size_t i = 0; i < runtime->num_ops; i++) {
    for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
      if (runtime->opdata[i].operator_objects[j] == NULL) {
        // Operator was removed after fusion.
        continue;
      }

      const enum xnn_status status = xnn_run_operator(runtime->opdata[i].operator_objects[j], runtime->threadpool);
      if (status != xnn_status_success) {
        return status;
      }
      if (runtime->profiling) {
        runtime->opdata[i].end_ts[j] = xnn_read_timer();
      }
    }
  }
  return xnn_status_success;
}

enum xnn_status xnn_delete_runtime(
  xnn_runtime_t runtime)
{
  if (runtime != NULL) {
    if (runtime->opdata != NULL) {
      for (size_t i = 0; i < runtime->num_ops; i++) {
        for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
          xnn_delete_operator(runtime->opdata[i].operator_objects[j]);
        }
      }
      xnn_release_memory(runtime->opdata);

      xnn_release_memory(runtime->blobs);

      if (runtime->workspace != NULL) {
        // Remove this runtime from the list of users of the workspace.
        assert(runtime->workspace->first_user != NULL);
        if (runtime->workspace->first_user == runtime) {
          runtime->workspace->first_user = runtime->next_workspace_user;
        } else {
          xnn_runtime_t prev = runtime->workspace->first_user;
          xnn_runtime_t curr = prev->next_workspace_user;
          while (curr != runtime) {
            prev = curr;
            curr = curr->next_workspace_user;
          }
          assert(curr == runtime);
          prev->next_workspace_user = curr->next_workspace_user;
        }
        xnn_release_workspace(runtime->workspace);
      }
    }

#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
    xnn_release_code_cache(&runtime->code_cache);
#endif
    xnn_release_memory(runtime);
  }
  return xnn_status_success;
}
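
// Usage sketch (illustrative only): typical teardown order. Runtimes are deleted first;
// xnn_delete_runtime drops this runtime's workspace reference, so a shared workspace is freed
// only when its last user is gone. A weights cache referenced by the runtime's operators should
// outlive the runtimes created with it.
//
//   xnn_delete_runtime(runtime);
//   xnn_delete_weights_cache(weights_cache);
//   xnn_deinitialize();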