serve: Add a killswitch that causes the server to uncleanly exit on hang
We define a hang as 120 seconds for now; that should be OK (famous last words). When I say unclean, I mean it; the control socket is left hanging around too. This is a workaround for the fact that the client can hang the whole server by sending a write request header specifying > 0 bytes, then uncleanly going away. On the server side, we acquire the IO mutex, and then try to read > 0 bytes from the socket; the data never arrives, and when the client reconnects, its requests never get a response (since we're waiting on that mutex). Getting rid of that mutex (which isn't actually needed, except for migration) would be better.
This commit is contained in:
66
src/client.c
66
src/client.c
@@ -20,6 +20,10 @@ struct client *client_create( struct server *serve, int socket )
|
|||||||
NULLCHECK( serve );
|
NULLCHECK( serve );
|
||||||
|
|
||||||
struct client *c;
|
struct client *c;
|
||||||
|
struct sigevent evp = {
|
||||||
|
.sigev_notify = SIGEV_SIGNAL,
|
||||||
|
.sigev_signo = CLIENT_KILLSWITCH_SIGNAL
|
||||||
|
};
|
||||||
|
|
||||||
c = xmalloc( sizeof( struct client ) );
|
c = xmalloc( sizeof( struct client ) );
|
||||||
c->stopped = 0;
|
c->stopped = 0;
|
||||||
@@ -28,6 +32,11 @@ struct client *client_create( struct server *serve, int socket )
|
|||||||
|
|
||||||
c->stop_signal = self_pipe_create();
|
c->stop_signal = self_pipe_create();
|
||||||
|
|
||||||
|
FATAL_IF_NEGATIVE(
|
||||||
|
timer_create( CLOCK_MONOTONIC, &evp, &(c->killswitch) ),
|
||||||
|
SHOW_ERRNO( "Failed to create killswitch timer" )
|
||||||
|
);
|
||||||
|
|
||||||
debug( "Alloced client %p with socket %d", c, socket );
|
debug( "Alloced client %p with socket %d", c, socket );
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
@@ -45,6 +54,11 @@ void client_destroy( struct client *client )
|
|||||||
{
|
{
|
||||||
NULLCHECK( client );
|
NULLCHECK( client );
|
||||||
|
|
||||||
|
FATAL_IF_NEGATIVE(
|
||||||
|
timer_delete( client->killswitch ),
|
||||||
|
SHOW_ERRNO( "Couldn't delete killswitch" )
|
||||||
|
);
|
||||||
|
|
||||||
debug( "Destroying stop signal for client %p", client );
|
debug( "Destroying stop signal for client %p", client );
|
||||||
self_pipe_destroy( client->stop_signal );
|
self_pipe_destroy( client->stop_signal );
|
||||||
debug( "Freeing client %p", client );
|
debug( "Freeing client %p", client );
|
||||||
@@ -476,6 +490,43 @@ void client_reply( struct client* client, struct nbd_request request )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Starts a timer that will kill the whole process if disarm is not called
|
||||||
|
* within a timeout (see CLIENT_HANDLE_TIMEOUT).
|
||||||
|
*/
|
||||||
|
void client_arm_killswitch( struct client* client )
|
||||||
|
{
|
||||||
|
struct itimerspec its = {
|
||||||
|
.it_value = { .tv_nsec = 0, .tv_sec = CLIENT_HANDLER_TIMEOUT },
|
||||||
|
.it_interval = { .tv_nsec = 0, .tv_sec = 0 }
|
||||||
|
};
|
||||||
|
|
||||||
|
debug( "Arming killswitch" );
|
||||||
|
|
||||||
|
FATAL_IF_NEGATIVE(
|
||||||
|
timer_settime( client->killswitch, 0, &its, NULL ),
|
||||||
|
SHOW_ERRNO( "Failed to arm killswitch" )
|
||||||
|
);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void client_disarm_killswitch( struct client* client )
|
||||||
|
{
|
||||||
|
struct itimerspec its = {
|
||||||
|
.it_value = { .tv_nsec = 0, .tv_sec = 0 },
|
||||||
|
.it_interval = { .tv_nsec = 0, .tv_sec = 0 }
|
||||||
|
};
|
||||||
|
|
||||||
|
debug( "Disarming killswitch" );
|
||||||
|
|
||||||
|
FATAL_IF_NEGATIVE(
|
||||||
|
timer_settime( client->killswitch, 0, &its, NULL ),
|
||||||
|
SHOW_ERRNO( "Failed to disarm killswitch" )
|
||||||
|
);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/* Returns 0 if we should continue trying to serve requests */
|
/* Returns 0 if we should continue trying to serve requests */
|
||||||
int client_serve_request(struct client* client)
|
int client_serve_request(struct client* client)
|
||||||
{
|
{
|
||||||
@@ -492,12 +543,27 @@ int client_serve_request(struct client* client)
|
|||||||
server_lock_io( client->serve );
|
server_lock_io( client->serve );
|
||||||
{
|
{
|
||||||
if ( !server_is_closed( client->serve ) ) {
|
if ( !server_is_closed( client->serve ) ) {
|
||||||
|
/* We arm / disarm around client_reply() to catch cases where the
|
||||||
|
* remote peer sends part of a write request data before dying,
|
||||||
|
* and cases where we send part of read reply data before they die.
|
||||||
|
*
|
||||||
|
* That last is theoretical right now, but could break us in the
|
||||||
|
* same way as a half-write (which causes us to sit in read forever)
|
||||||
|
*
|
||||||
|
* We only arm/disarm inside the server io lock because it's common
|
||||||
|
* during migrations for us to be hanging on that mutex for quite
|
||||||
|
* a while while the final pass happens - it's held for the entire
|
||||||
|
* time.
|
||||||
|
*/
|
||||||
|
client_arm_killswitch( client );
|
||||||
client_reply( client, request );
|
client_reply( client, request );
|
||||||
|
client_disarm_killswitch( client );
|
||||||
stop = 0;
|
stop = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
server_unlock_io( client->serve );
|
server_unlock_io( client->serve );
|
||||||
|
|
||||||
|
|
||||||
return stop;
|
return stop;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
25
src/client.h
25
src/client.h
@@ -1,6 +1,9 @@
|
|||||||
#ifndef CLIENT_H
|
#ifndef CLIENT_H
|
||||||
#define CLIENT_H
|
#define CLIENT_H
|
||||||
|
|
||||||
|
#include <signal.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
/** CLIENT_MAX_WAIT_SECS
|
/** CLIENT_MAX_WAIT_SECS
|
||||||
* This is the length of time an inbound migration will wait for a fresh
|
* This is the length of time an inbound migration will wait for a fresh
|
||||||
* write before assuming the source has Gone Away. Note: it is *not*
|
* write before assuming the source has Gone Away. Note: it is *not*
|
||||||
@@ -9,6 +12,19 @@
|
|||||||
*/
|
*/
|
||||||
#define CLIENT_MAX_WAIT_SECS 5
|
#define CLIENT_MAX_WAIT_SECS 5
|
||||||
|
|
||||||
|
/** CLIENT_HANDLER_TIMEOUT
|
||||||
|
* This is the length of time (in seconds) any request can be outstanding for.
|
||||||
|
* If we spend longer than this in a request, the whole server is killed.
|
||||||
|
*/
|
||||||
|
#define CLIENT_HANDLER_TIMEOUT 120
|
||||||
|
|
||||||
|
/** CLIENT_KILLSWITCH_SIGNAL
|
||||||
|
* The signal number we use to kill the server when *any* killswitch timer
|
||||||
|
* fires. We don't actually need to install a signal handler for it, the default
|
||||||
|
* behaviour is perfectly fine.
|
||||||
|
*/
|
||||||
|
#define CLIENT_KILLSWITCH_SIGNAL ( SIGRTMIN + 1 )
|
||||||
|
|
||||||
|
|
||||||
struct client {
|
struct client {
|
||||||
/* When we call pthread_join, if the thread is already dead
|
/* When we call pthread_join, if the thread is already dead
|
||||||
@@ -20,16 +36,20 @@ struct client {
|
|||||||
*/
|
*/
|
||||||
int stopped;
|
int stopped;
|
||||||
int socket;
|
int socket;
|
||||||
|
|
||||||
int fileno;
|
int fileno;
|
||||||
char* mapped;
|
char* mapped;
|
||||||
|
|
||||||
struct self_pipe * stop_signal;
|
struct self_pipe * stop_signal;
|
||||||
|
|
||||||
struct server* serve; /* FIXME: remove above duplication */
|
struct server* serve; /* FIXME: remove above duplication */
|
||||||
|
|
||||||
/* Have we seen a REQUEST_DISCONNECT message? */
|
/* Have we seen a REQUEST_DISCONNECT message? */
|
||||||
int disconnect;
|
int disconnect;
|
||||||
|
|
||||||
|
/* kill the whole server if a request has been outstanding too long */
|
||||||
|
timer_t killswitch;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -39,3 +59,4 @@ void client_destroy( struct client * client );
|
|||||||
void client_signal_stop( struct client * client );
|
void client_signal_stop( struct client * client );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user