summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormukesh agrawal <quiche@google.com>2016-11-11 16:20:15 -0800
committermukesh agrawal <quiche@google.com>2016-11-15 11:12:35 -0800
commitfd59221459953a821b9021e00276749ac46ea5ea (patch)
tree05757d3299dab1f296b9714202182c0549fe8cbc
parent960c8a88c83c55c21a3286628591a41d2fb42231 (diff)
downloadwifilogd-fd59221459953a821b9021e00276749ac46ea5ea.tar.gz
MainLoop: improve error handling
Improve error handling, when reading from the log socket: - On transient errors: sleep, to allow the transient condition to pass (and to avoid a tight CPU loop, if the condition is not so transient). - On all other errors: abort the daemon. In the case of a non-transient error, we have two choices: continue running, or abort (and hope the new process comes up in a good state). We to abort, for two reasons: - we assume that newer logs are more important than older ones - even if we kept running, we might not be able to dump the logs that we'd captured so far, short of a core dump. (The request to dump logs would, itself, need to be read from the log socket.) Bug: 32481888 Test: ./runtests.sh (on angler) Change-Id: I68619d192f7682436c9f78b062d1b6070cdabcdf
-rw-r--r--main_loop.cpp20
-rw-r--r--main_loop.h2
-rw-r--r--tests/main_loop_unittest.cpp9
3 files changed, 28 insertions, 3 deletions
diff --git a/main_loop.cpp b/main_loop.cpp
index 62bed31..8ad030a 100644
--- a/main_loop.cpp
+++ b/main_loop.cpp
@@ -30,6 +30,8 @@ namespace wifilogd {
namespace {
constexpr auto kMainBufferSizeBytes = 128 * 1024;
+// TODO(b/32840641): Tune the sleep time.
+constexpr auto kTransientErrorSleepTimeNsec = 100 * 1000; // 100 usec
}
MainLoop::MainLoop(const std::string& socket_name)
@@ -53,8 +55,7 @@ void MainLoop::RunOnce() {
std::tie(datagram_len, err) =
os_->ReceiveDatagram(sock_fd_, input_buf.data(), input_buf.size());
if (err) {
- // TODO(b/32098735): Increment stats counter.
- // TODO(b/32481888): Improve error handling.
+ ProcessError(err);
return;
}
@@ -67,5 +68,20 @@ void MainLoop::RunOnce() {
Os::kInvalidFd);
}
+// Private methods below.
+
+void MainLoop::ProcessError(Os::Errno err) {
+ if (err == EINTR || err == ENOMEM) {
+ // TODO(b/32098735): Increment stats counter.
+ os_->Nanosleep(kTransientErrorSleepTimeNsec);
+ return;
+ }
+
+ // Any other error is unexpected, and assumed to be non-recoverable.
+ // (If, e.g., our socket is in a bad state, then we won't be able to receive
+ // any new log messages.)
+ LOG(FATAL) << "Unexpected error: " << std::strerror(err);
+}
+
} // namespace wifilogd
} // namespace android
diff --git a/main_loop.h b/main_loop.h
index c5a4b48..53c42f8 100644
--- a/main_loop.h
+++ b/main_loop.h
@@ -39,6 +39,8 @@ class MainLoop {
void RunOnce();
private:
+ void ProcessError(Os::Errno err);
+
std::unique_ptr<Os> os_;
std::unique_ptr<CommandProcessor> command_processor_;
// We use an int, rather than a unique_fd, because the file
diff --git a/tests/main_loop_unittest.cpp b/tests/main_loop_unittest.cpp
index db04027..b6c1141 100644
--- a/tests/main_loop_unittest.cpp
+++ b/tests/main_loop_unittest.cpp
@@ -109,9 +109,10 @@ TEST_F(MainLoopTest, RunOnceLimitsMaxSizeReportedToCommandProcessor) {
main_loop_->RunOnce();
}
-TEST_F(MainLoopTest, RunOnceDoesNotPassDataToCommandProcessorOnError) {
+TEST_F(MainLoopTest, RunOnceSleepsAndDoesNotPassDataToCommandProcessorOnError) {
EXPECT_CALL(*os_, ReceiveDatagram(_, _, protocol::kMaxMessageSize))
.WillOnce(Return(std::tuple<size_t, Os::Errno>{0, EINTR}));
+ EXPECT_CALL(*os_, Nanosleep(_));
EXPECT_CALL(*command_processor_, ProcessCommand(_, _, _)).Times(0);
main_loop_->RunOnce();
}
@@ -131,5 +132,11 @@ TEST_F(MainLoopDeathTest, CtorFailureToFetchControlSocketCausesDeath) {
"Failed to get control socket");
}
+TEST_F(MainLoopDeathTest, RunOnceTerminatesOnUnexpectedError) {
+ ON_CALL(*os_, ReceiveDatagram(_, _, protocol::kMaxMessageSize))
+ .WillByDefault(Return(std::tuple<size_t, Os::Errno>{0, EFAULT}));
+ EXPECT_DEATH(main_loop_->RunOnce(), "Unexpected error");
+}
+
} // namespace wifilogd
} // namespace android