From f7e5353355319524183291d05689e9578c544bf6 Mon Sep 17 00:00:00 2001 From: nick Date: Thu, 6 Jun 2013 14:16:20 +0100 Subject: [PATCH] serve: Add a killswitch that causes the server to uncleanly exit on hang We define a hang as 120 seconds for now; that should be OK (famous last words). When I say unclean, I mean it; the control socket is left hanging around too. This is a workaround for the fact that the client can hang the whole server by sending a write request header specifying > 0 bytes, then uncleanly going away. On the server side, we acquire the IO mutex, and then try to read > 0 bytes from the socket; the data never arrives, and when the client reconnects, its requests never get a response (since we're waiting on that mutex). Getting rid of that mutex (which isn't actually needed, except for migration) would be better. --- src/client.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/client.h | 25 ++++++++++++++++++-- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/src/client.c b/src/client.c index 5ca276c..fb99c39 100644 --- a/src/client.c +++ b/src/client.c @@ -20,6 +20,10 @@ struct client *client_create( struct server *serve, int socket ) NULLCHECK( serve ); struct client *c; + struct sigevent evp = { + .sigev_notify = SIGEV_SIGNAL, + .sigev_signo = CLIENT_KILLSWITCH_SIGNAL + }; c = xmalloc( sizeof( struct client ) ); c->stopped = 0; @@ -28,6 +32,11 @@ struct client *client_create( struct server *serve, int socket ) c->stop_signal = self_pipe_create(); + FATAL_IF_NEGATIVE( + timer_create( CLOCK_MONOTONIC, &evp, &(c->killswitch) ), + SHOW_ERRNO( "Failed to create killswitch timer" ) + ); + debug( "Alloced client %p with socket %d", c, socket ); return c; } @@ -45,6 +54,11 @@ void client_destroy( struct client *client ) { NULLCHECK( client ); + FATAL_IF_NEGATIVE( + timer_delete( client->killswitch ), + SHOW_ERRNO( "Couldn't delete killswitch" ) + ); + debug( "Destroying stop signal for client %p", client ); self_pipe_destroy( client->stop_signal ); debug( "Freeing client %p", client ); @@ -476,6 +490,43 @@ void client_reply( struct client* client, struct nbd_request request ) } +/* Starts a timer that will kill the whole process if disarm is not called + * within a timeout (see CLIENT_HANDLE_TIMEOUT). + */ +void client_arm_killswitch( struct client* client ) +{ + struct itimerspec its = { + .it_value = { .tv_nsec = 0, .tv_sec = CLIENT_HANDLER_TIMEOUT }, + .it_interval = { .tv_nsec = 0, .tv_sec = 0 } + }; + + debug( "Arming killswitch" ); + + FATAL_IF_NEGATIVE( + timer_settime( client->killswitch, 0, &its, NULL ), + SHOW_ERRNO( "Failed to arm killswitch" ) + ); + + return; +} + +void client_disarm_killswitch( struct client* client ) +{ + struct itimerspec its = { + .it_value = { .tv_nsec = 0, .tv_sec = 0 }, + .it_interval = { .tv_nsec = 0, .tv_sec = 0 } + }; + + debug( "Disarming killswitch" ); + + FATAL_IF_NEGATIVE( + timer_settime( client->killswitch, 0, &its, NULL ), + SHOW_ERRNO( "Failed to disarm killswitch" ) + ); + + return; +} + /* Returns 0 if we should continue trying to serve requests */ int client_serve_request(struct client* client) { @@ -492,12 +543,27 @@ int client_serve_request(struct client* client) server_lock_io( client->serve ); { if ( !server_is_closed( client->serve ) ) { + /* We arm / disarm around client_reply() to catch cases where the + * remote peer sends part of a write request data before dying, + * and cases where we send part of read reply data before they die. + * + * That last is theoretical right now, but could break us in the + * same way as a half-write (which causes us to sit in read forever) + * + * We only arm/disarm inside the server io lock because it's common + * during migrations for us to be hanging on that mutex for quite + * a while while the final pass happens - it's held for the entire + * time. + */ + client_arm_killswitch( client ); client_reply( client, request ); + client_disarm_killswitch( client ); stop = 0; } } server_unlock_io( client->serve ); + return stop; } diff --git a/src/client.h b/src/client.h index 6cc87ed..8a16109 100644 --- a/src/client.h +++ b/src/client.h @@ -1,6 +1,9 @@ #ifndef CLIENT_H #define CLIENT_H +#include +#include + /** CLIENT_MAX_WAIT_SECS * This is the length of time an inbound migration will wait for a fresh * write before assuming the source has Gone Away. Note: it is *not* @@ -9,6 +12,19 @@ */ #define CLIENT_MAX_WAIT_SECS 5 +/** CLIENT_HANDLER_TIMEOUT + * This is the length of time (in seconds) any request can be outstanding for. + * If we spend longer than this in a request, the whole server is killed. + */ +#define CLIENT_HANDLER_TIMEOUT 120 + +/** CLIENT_KILLSWITCH_SIGNAL + * The signal number we use to kill the server when *any* killswitch timer + * fires. We don't actually need to install a signal handler for it, the default + * behaviour is perfectly fine. + */ +#define CLIENT_KILLSWITCH_SIGNAL ( SIGRTMIN + 1 ) + struct client { /* When we call pthread_join, if the thread is already dead @@ -20,16 +36,20 @@ struct client { */ int stopped; int socket; - + int fileno; char* mapped; struct self_pipe * stop_signal; - + struct server* serve; /* FIXME: remove above duplication */ /* Have we seen a REQUEST_DISCONNECT message? */ int disconnect; + + /* kill the whole server if a request has been outstanding too long */ + timer_t killswitch; + }; @@ -39,3 +59,4 @@ void client_destroy( struct client * client ); void client_signal_stop( struct client * client ); #endif +