serve: Add a killswitch that causes the server to uncleanly exit on hang

We define a hang as 120 seconds for now; that should be OK (famous last words). When I say unclean, I mean it; the control socket is left hanging around too. This is a workaround for the fact that the client can hang the whole server by sending a write request header specifying > 0 bytes, then uncleanly going away. On the server side, we acquire the IO mutex, and then try to read > 0 bytes from the socket; the data never arrives, and when the client reconnects, its requests never get a response (since we're waiting on that mutex). Getting rid of that mutex (which isn't actually needed, except for migration) would be better.
2013-06-06 14:16:20 +01:00
parent f9fe421472
commit f7e5353355
2 changed files with 89 additions and 2 deletions
--- a/src/client.c
+++ b/src/client.c
@@ -20,6 +20,10 @@ struct client *client_create( struct server *serve, int socket )
 	NULLCHECK( serve );

 	struct client *c;
+	struct sigevent evp = {
+		.sigev_notify = SIGEV_SIGNAL,
+		.sigev_signo = CLIENT_KILLSWITCH_SIGNAL
+	};

 	c = xmalloc( sizeof( struct client ) );
 	c->stopped = 0;
@@ -28,6 +32,11 @@ struct client *client_create( struct server *serve, int socket )

 	c->stop_signal = self_pipe_create();

+	FATAL_IF_NEGATIVE(
+		timer_create( CLOCK_MONOTONIC, &evp, &(c->killswitch) ),
+		SHOW_ERRNO( "Failed to create killswitch timer" )
+	);
+
 	debug( "Alloced client %p with socket %d", c, socket );
 	return c;
 }
@@ -45,6 +54,11 @@ void client_destroy( struct client *client )
 {
 	NULLCHECK( client );

+	FATAL_IF_NEGATIVE(
+		timer_delete( client->killswitch ),
+		SHOW_ERRNO( "Couldn't delete killswitch" )
+	);
+
 	debug( "Destroying stop signal for client %p", client );
 	self_pipe_destroy( client->stop_signal );
 	debug( "Freeing client %p", client );
@@ -476,6 +490,43 @@ void client_reply( struct client* client, struct nbd_request request )
 }


+/* Starts a timer that will kill the whole process if disarm is not called
+ * within a timeout (see CLIENT_HANDLE_TIMEOUT).
+ */
+void client_arm_killswitch( struct client* client )
+{
+	struct itimerspec its = {
+		.it_value    = { .tv_nsec = 0, .tv_sec = CLIENT_HANDLER_TIMEOUT },
+		.it_interval = { .tv_nsec = 0, .tv_sec = 0 }
+	};
+
+	debug( "Arming killswitch" );
+
+	FATAL_IF_NEGATIVE(
+		timer_settime( client->killswitch, 0, &its, NULL ),
+		SHOW_ERRNO( "Failed to arm killswitch" )
+	);
+
+	return;
+}
+
+void client_disarm_killswitch( struct client* client )
+{
+	struct itimerspec its = {
+		.it_value    = { .tv_nsec = 0, .tv_sec = 0 },
+		.it_interval = { .tv_nsec = 0, .tv_sec = 0 }
+	};
+
+	debug( "Disarming killswitch" );
+
+	FATAL_IF_NEGATIVE(
+		timer_settime( client->killswitch, 0, &its, NULL ),
+		SHOW_ERRNO( "Failed to disarm killswitch" )
+	);
+
+	return;
+}
+
 /* Returns 0 if we should continue trying to serve requests */
 int client_serve_request(struct client* client)
 {
@@ -492,12 +543,27 @@ int client_serve_request(struct client* client)
 	server_lock_io( client->serve );
 	{
 		if ( !server_is_closed( client->serve ) ) {
+			/* We arm / disarm around client_reply() to catch cases where the
+			 * remote peer sends part of a write request data before dying,
+			 * and cases where we send part of read reply data before they die.
+			 *
+			 * That last is theoretical right now, but could break us in the
+			 * same way as a half-write (which causes us to sit in read forever)
+			 *
+			 * We only arm/disarm inside the server io lock because it's common
+			 * during migrations for us to be hanging on that mutex for quite
+			 * a while while the final pass happens - it's held for the entire
+			 * time.
+			 */
+			client_arm_killswitch( client );
 			client_reply( client, request );
+			client_disarm_killswitch( client );
 			stop = 0;
 		}
 	}
 	server_unlock_io( client->serve );

+
 	return stop;
 }

--- a/src/client.h
+++ b/src/client.h
@@ -1,6 +1,9 @@
 #ifndef CLIENT_H
 #define CLIENT_H

+#include <signal.h>
+#include <time.h>
+
 /** CLIENT_MAX_WAIT_SECS
 * This is the length of time an inbound migration will wait for a fresh
 * write before assuming the source has Gone Away.  Note: it is *not*
@@ -9,6 +12,19 @@
 */
 #define CLIENT_MAX_WAIT_SECS 5

+/** CLIENT_HANDLER_TIMEOUT
+ * This is the length of time (in seconds) any request can be outstanding for.
+ * If we spend longer than this in a request, the whole server is killed.
+ */
+#define CLIENT_HANDLER_TIMEOUT 120
+
+/** CLIENT_KILLSWITCH_SIGNAL
+ * The signal number we use to kill the server when *any* killswitch timer
+ * fires. We don't actually need to install a signal handler for it, the default
+ * behaviour is perfectly fine.
+ */
+#define CLIENT_KILLSWITCH_SIGNAL ( SIGRTMIN + 1 )
+

 struct client {
 	/* When we call pthread_join, if the thread is already dead
@@ -20,16 +36,20 @@ struct client {
 	 */
 	int     stopped;
 	int     socket;
-	
+
 	int     fileno;
 	char*   mapped;

 	struct self_pipe * stop_signal;
-	
+
 	struct server* serve; /* FIXME: remove above duplication */

 	/* Have we seen a REQUEST_DISCONNECT message? */
 	int     disconnect;
+
+	/* kill the whole server if a request has been outstanding too long */
+	timer_t killswitch;
+
 };


@@ -39,3 +59,4 @@ void client_destroy( struct client * client );
 void client_signal_stop( struct client * client );

 #endif
+