diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2023-06-13 13:46:03 -0700 |
---|---|---|
committer | TensorFlow Release Automation <jenkins@tensorflow.org> | 2023-06-15 00:03:00 +0000 |
commit | c2713db3dd4bd6eb80f22bf47fce01aebc1abc0a (patch) | |
tree | 19a5aad49261134826ac698e8d743c5f11a3b01d | |
parent | 7c110551ee133313ab903cd4a72b29554e0d4c5c (diff) | |
download | tensorflow-upstream-r2.13-c45a6c0b1cb.tar.gz |
Simplified retry logic to DNS cacheupstream-r2.13-c45a6c0b1cb
PiperOrigin-RevId: 540061371
-rw-r--r-- | tensorflow/tsl/platform/cloud/BUILD | 5 | ||||
-rw-r--r-- | tensorflow/tsl/platform/cloud/gcs_dns_cache.cc | 108 |
2 files changed, 97 insertions, 16 deletions
diff --git a/tensorflow/tsl/platform/cloud/BUILD b/tensorflow/tsl/platform/cloud/BUILD index 223b8f2443b..7d31aa734e3 100644 --- a/tensorflow/tsl/platform/cloud/BUILD +++ b/tensorflow/tsl/platform/cloud/BUILD @@ -82,6 +82,11 @@ cc_library( deps = [ ":http_request", "//tensorflow/tsl/platform:env", + "//tensorflow/tsl/platform:errors", + "//tensorflow/tsl/platform:retrying_utils", + "//tensorflow/tsl/platform:status", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/tsl/platform/cloud/gcs_dns_cache.cc b/tensorflow/tsl/platform/cloud/gcs_dns_cache.cc index 890199661de..92ad032f0dd 100644 --- a/tensorflow/tsl/platform/cloud/gcs_dns_cache.cc +++ b/tensorflow/tsl/platform/cloud/gcs_dns_cache.cc @@ -14,6 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/tsl/platform/cloud/gcs_dns_cache.h" + +#include <cstring> + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/retrying_utils.h" +#include "tensorflow/tsl/platform/status.h" #ifndef _WIN32 #include <arpa/inet.h> #include <netdb.h> @@ -33,19 +41,9 @@ namespace { const std::vector<string>& kCachedDomainNames = *new std::vector<string>{"www.googleapis.com", "storage.googleapis.com"}; -inline void print_getaddrinfo_error(const string& name, int error_code) { -#ifndef _WIN32 - if (error_code == EAI_SYSTEM) { - LOG(ERROR) << "Error resolving " << name - << " (EAI_SYSTEM): " << strerror(errno); - } else { - LOG(ERROR) << "Error resolving " << name << ": " - << gai_strerror(error_code); - } -#else - // TODO:WSAGetLastError is better than gai_strerror - LOG(ERROR) << "Error resolving " << name << ": " << gai_strerror(error_code); -#endif +inline void print_getaddrinfo_error(const string& name, Status return_status) { + // Status doesn't map well to EAI type errors. + LOG(ERROR) << "Error resolving " << name << ": " << return_status; } // Selects one item at random from a vector of items, using a uniform @@ -101,10 +99,88 @@ void GcsDnsCache::AnnotateRequest(HttpRequest* request) { hints.ai_family = AF_INET; // Only use IPv4 for now. hints.ai_socktype = SOCK_STREAM; addrinfo* result = nullptr; - int return_code = getaddrinfo(name.c_str(), nullptr, &hints, &result); + RetryConfig retryConfig( + /* init_delay_time_us = */ 5000, + /* max_delay_time_us = */ 50 * 1000 * 5000, + /* max_retries = */ 5); + + const Status getaddrinfo_status = RetryingUtils::CallWithRetries( + [&name, &hints, &result]() { + int return_code = getaddrinfo(name.c_str(), nullptr, &hints, &result); + absl::Status return_status; + switch (return_code) { + case 0: + return_status = OkStatus(); + break; +#ifndef _WIN32 + case EAI_ADDRFAMILY: + case EAI_SERVICE: + case EAI_SOCKTYPE: + case EAI_NONAME: + return_status = absl::FailedPreconditionError( + absl::StrCat("System in invalid state for getaddrinfo call: ", + gai_strerror(return_code))); + break; + case EAI_AGAIN: + case EAI_NODATA: // lump nodata in here - the domains being resolved + // should always have data + return_status = absl::UnavailableError(absl::StrCat( + "Resolving ", name, " is temporarily unavailable")); + break; + case EAI_BADFLAGS: + case EAI_FAMILY: + return_status = absl::InvalidArgumentError(absl::StrCat( + "Bad arguments for getaddrinfo: ", gai_strerror(return_code))); + break; + case EAI_FAIL: + return_status = absl::NotFoundError( + absl::StrCat("Permanent failure resolving ", name, ": ", + gai_strerror(return_code))); + break; + case EAI_MEMORY: + return_status = absl::ResourceExhaustedError("Out of memory"); + break; + case EAI_SYSTEM: + default: + return_status = absl::UnknownError(strerror(return_code)); +#else + // mapping from + // https://learn.microsoft.com/en-us/windows/win32/api/ws2tcpip/nf-ws2tcpip-getaddrinfo#return-value + case WSATYPE_NOT_FOUND: + case WSAESOCKTNOSUPPORT: + case WSAHOST_NOT_FOUND: + return_status = absl::FailedPreconditionError( + absl::StrCat("System in invalid state for getaddrinfo call: ", + gai_strerror(return_code))); + break; + case WSATRY_AGAIN: + return_status = absl::UnavailableError(absl::StrCat( + "Resolving ", name, " is temporarily unavailable")); + break; + case WSAEINVAL: + case WSAEAFNOSUPPORT: + return_status = absl::InvalidArgumentError(absl::StrCat( + "Bad arguments for getaddrinfo: ", gai_strerror(return_code))); + break; + case WSANO_RECOVERY: + return_status = absl::NotFoundError( + absl::StrCat("Permanent failure resolving ", name, ": ", + gai_strerror(return_code))); + break; + case WSA_NOT_ENOUGH_MEMORY: + return_status = absl::ResourceExhaustedError("Out of memory"); + break; + default: + return_status = absl::UnknownError(strerror(return_code)); +#endif + } + + return Status(return_status); + }, + retryConfig); std::vector<string> output; - if (return_code == 0) { + if (getaddrinfo_status.ok()) { for (const addrinfo* i = result; i != nullptr; i = i->ai_next) { if (i->ai_family != AF_INET || i->ai_addr->sa_family != AF_INET) { LOG(WARNING) << "Non-IPv4 address returned. ai_family: " << i->ai_family @@ -125,7 +201,7 @@ void GcsDnsCache::AnnotateRequest(HttpRequest* request) { } } } else { - print_getaddrinfo_error(name, return_code); + print_getaddrinfo_error(name, getaddrinfo_status); } if (result != nullptr) { freeaddrinfo(result); |