From 97a923afdf953cd97747b7671836e7dafe2b4f4e Mon Sep 17 00:00:00 2001 From: nick Date: Wed, 23 Oct 2013 15:58:47 +0100 Subject: [PATCH] mirror: Don't start migrating until the allocation map is built There is a fun race that can happen if we begin migrating while the allocation map is still building. We call bitset_enable_stream() when the migration begins, which causes the builder to start putting events into the stream. This is bad all by itself, as it slows the migration down for no reason, but the stream is a limited-size queue and there are situations (migration fails and is restarted) where we can end up with the queue full and nobody able to empty it, freezing the whole thing. --- src/mirror.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ src/serve.c | 1 + src/serve.h | 2 ++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/mirror.c b/src/mirror.c index dbf34c7..acc5144 100644 --- a/src/mirror.c +++ b/src/mirror.c @@ -70,6 +70,7 @@ struct mirror_ctrl { /* libev stuff */ struct ev_loop *ev_loop; + ev_timer begin_watcher; ev_io read_watcher; ev_io write_watcher; ev_timer timeout_watcher; @@ -669,6 +670,7 @@ void mirror_abandon_cb( struct ev_loop *loop, ev_io *w, int revents ) return; } + void mirror_limit_cb( struct ev_loop *loop, ev_timer *w, int revents ) { struct mirror_ctrl* ctrl = (struct mirror_ctrl*) w->data; @@ -693,6 +695,36 @@ void mirror_limit_cb( struct ev_loop *loop, ev_timer *w, int revents ) return; } +/* We use this to periodically check whether the allocation map has built, and + * if it has, start migrating. If it's not finished, then enabling the bitset + * stream does not go well for us. + */ +void mirror_begin_cb( struct ev_loop *loop, ev_timer *w, int revents ) +{ + struct mirror_ctrl* ctrl = (struct mirror_ctrl*) w->data; + NULLCHECK( ctrl ); + + if ( !(revents & EV_TIMER ) ) { + warn( "Mirror limit callback executed but no timer event signalled" ); + return; + } + + if ( ctrl->serve->allocation_map_built || ctrl->serve->allocation_map_not_built ) { + debug( "allocation map builder is finished, beginning migration" ); + /* Start by writing xfer 0 to the listener */ + ev_io_start( loop, &ctrl->write_watcher ); + /* We want to timeout during the first write as well as subsequent ones */ + ev_timer_again( loop, &ctrl->timeout_watcher ); + /* We're now interested in events */ + bitset_enable_stream( ctrl->serve->allocation_map ); + } else { + /* not done yet, so wait another second */ + ev_timer_again( loop, w ); + } + + return; +} + void mirror_run( struct server *serve ) { NULLCHECK( serve ); @@ -723,6 +755,10 @@ void mirror_run( struct server *serve ) ctrl.ev_loop = EV_DEFAULT; /* gcc warns on -O2. clang is fine. Seems to be the fault of ev.h */ + ev_init( &ctrl.begin_watcher, mirror_begin_cb ); + ctrl.begin_watcher.repeat = 1.0; // We check bps every second. seems sane. + ctrl.begin_watcher.data = (void*) &ctrl; + ev_io_init( &ctrl.read_watcher, mirror_read_cb, m->client, EV_READ ); ctrl.read_watcher.data = (void*) &ctrl; @@ -746,19 +782,23 @@ void mirror_run( struct server *serve ) "Couldn't find first transfer for mirror!" ); - /* Start by writing xfer 0 to the listener */ - ev_io_start( ctrl.ev_loop, &ctrl.write_watcher ); - /* We want to timeout during the first write as well as subsequent ones */ - ev_timer_again( ctrl.ev_loop, &ctrl.timeout_watcher ); + if ( serve->allocation_map_built ) { + /* Start by writing xfer 0 to the listener */ + ev_io_start( ctrl.ev_loop, &ctrl.write_watcher ); + /* We want to timeout during the first write as well as subsequent ones */ + ev_timer_again( ctrl.ev_loop, &ctrl.timeout_watcher ); + bitset_enable_stream( serve->allocation_map ); + } else { + debug( "Waiting for allocation map to be built" ); + ev_timer_again( ctrl.ev_loop, &ctrl.begin_watcher ); + } /* Everything up to here is blocking. We switch to non-blocking so we * can handle rate-limiting and weird error conditions better. TODO: We * should expand the event loop upwards so we can do the same there too */ sock_set_nonblock( m->client, 1 ); - bitset_enable_stream( serve->allocation_map ); - info( "Entering event loop" ); ev_run( ctrl.ev_loop, 0 ); info( "Exited event loop" ); diff --git a/src/serve.c b/src/serve.c index d7b3883..80605bc 100644 --- a/src/serve.c +++ b/src/serve.c @@ -686,6 +686,7 @@ void* build_allocation_map_thread(void* serve_uncast) * the future, we'll need to wait for the allocation map to finish or * fail before we can complete the migration. */ + serve->allocation_map_not_built = 1; warn( "Didn't build allocation map for %s", serve->filename ); } diff --git a/src/serve.h b/src/serve.h index 8d1043a..bb9513c 100644 --- a/src/serve.h +++ b/src/serve.h @@ -76,8 +76,10 @@ struct server { struct bitset * allocation_map; /* when starting up, this thread builds the allocation_map */ pthread_t allocation_map_builder_thread; + /* when the thread has finished, it sets this to 1 */ volatile sig_atomic_t allocation_map_built; + volatile sig_atomic_t allocation_map_not_built; int max_nbd_clients; struct client_tbl_entry *nbd_client;