From ce9499efce3b7ccceb5d5c4fc9f2eb75f830a782 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 13:59:49 +0000 Subject: [PATCH 1/5] Rubocop; add test to bombard a migration source with status commands --- .../acceptance/test_write_during_migration.rb | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) mode change 100644 => 100755 tests/acceptance/test_write_during_migration.rb diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb old mode 100644 new mode 100755 index dd259c5..053fcc2 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -82,15 +82,14 @@ class TestWriteDuringMigration < Test::Unit::TestCase UNIXSocket.open(@source_sock) do |sock| sock.write(['mirror', '127.0.0.1', @dest_port.to_s, 'exit'].join("\x0A") + "\x0A\x0A") sock.flush - rsp = sock.readline + sock.readline end end def wait_for_quit Timeout.timeout(10) do - start_time = Time.now - dst_result = Process.waitpid2(@dst_proc) - src_result = Process.waitpid2(@src_proc) + Process.waitpid2(@dst_proc) + Process.waitpid2(@src_proc) end end @@ -100,13 +99,28 @@ class TestWriteDuringMigration < Test::Unit::TestCase loop do begin client.write(offsets[rand(offsets.size)] * 4096, @write_data) - rescue StandardError => err + rescue StandardError # We expect a broken write at some point, so ignore it break end end end + def bombard_with_status + loop do + begin + UNIXSocket.open(@source_sock) do |sock| + sock.write("status\x0A\x0A") + sock.flush + sock.readline + end + rescue Errno::ENOENT + # If the socket disappears, that's OK. + break + end + end + end + def assert_both_sides_identical # puts `md5sum #{@source_file} #{@dest_file}` @@ -160,5 +174,25 @@ class TestWriteDuringMigration < Test::Unit::TestCase (src_writers_1 + src_writers_2).each(&:join) assert_both_sides_identical end - end end + end + end + + + def test_status_call_after_cleanup + Dir.mktmpdir do |tmpdir| + Dir.chdir(tmpdir) do + make_files + + launch_servers + + status_poker = Thread.new { bombard_with_status } + + start_mirror + + wait_for_quit + status_poker.join + assert_both_sides_identical + end + end + end end From 70a3a4bb55e621947f2eac4af99f5a6c41dcd03a Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 15:02:55 +0000 Subject: [PATCH 2/5] Close the control socket during cleanup This should prevent further requests coming in, triggering deadlocks. --- src/server/serve.c | 14 +++++--------- tests/acceptance/test_write_during_migration.rb | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/server/serve.c b/src/server/serve.c index e5cc588..af7afb4 100644 --- a/src/server/serve.c +++ b/src/server/serve.c @@ -827,6 +827,11 @@ void serve_cleanup(struct server *params, close(params->server_fd); } + /* close the control socket too */ + if (params->flexnbd && params->flexnbd->control) { + control_signal_close(params->flexnbd->control); + } + /* need to stop background build if we're killed very early on */ pthread_cancel(params->allocation_map_builder_thread); pthread_join(params->allocation_map_builder_thread, &status); @@ -861,15 +866,6 @@ void serve_cleanup(struct server *params, server_unlock_acl(params); } - /* if( params->flexnbd ) { */ - /* if ( params->flexnbd->control ) { */ - /* flexnbd_stop_control( params->flexnbd ); */ - /* } */ - /* flexnbd_destroy( params->flexnbd ); */ - /* } */ - - /* server_destroy( params ); */ - debug("Cleanup done"); } diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb index 053fcc2..70ef9e6 100755 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -114,7 +114,7 @@ class TestWriteDuringMigration < Test::Unit::TestCase sock.flush sock.readline end - rescue Errno::ENOENT + rescue StandardError # If the socket disappears, that's OK. break end From 5839a36ab10ed60411ab6e1c0c5069109d6b9084 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 15:05:19 +0000 Subject: [PATCH 3/5] Remove useless function definition --- src/server/serve.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/server/serve.c b/src/server/serve.c index af7afb4..28970aa 100644 --- a/src/server/serve.c +++ b/src/server/serve.c @@ -812,8 +812,6 @@ void server_control_arrived(struct server *serve) } -void flexnbd_stop_control(struct flexnbd *flexnbd); - /** Closes sockets, frees memory and waits for all client threads to finish */ void serve_cleanup(struct server *params, int fatal __attribute__ ((unused))) From 842e7d362d0c6231ba71341b07b51dc5cdcc0914 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 16:32:58 +0000 Subject: [PATCH 4/5] Ensure control socket is closed first, and wait for it to close. --- src/server/control.c | 8 ++++++++ src/server/control.h | 1 + src/server/serve.c | 19 +++++++++++++------ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/server/control.c b/src/server/control.c index 394854e..51e58e9 100644 --- a/src/server/control.c +++ b/src/server/control.c @@ -78,6 +78,14 @@ void control_destroy(struct control *control) free(control); } +void control_wait_for_close(struct control *control) +{ + NULLCHECK(control); + while (!fd_is_closed(control->control_fd)) { + usleep(10000); + } +} + struct control_client *control_client_create(struct flexnbd *flexnbd, int client_fd, struct mbox *state_mbox) diff --git a/src/server/control.h b/src/server/control.h index 017611d..551265f 100644 --- a/src/server/control.h +++ b/src/server/control.h @@ -47,6 +47,7 @@ struct control_client { struct control *control_create(struct flexnbd *, const char *control_socket_name); void control_signal_close(struct control *); +void control_wait_for_close(struct control *control); void control_destroy(struct control *); void *control_runner(void *); diff --git a/src/server/serve.c b/src/server/serve.c index 28970aa..c262189 100644 --- a/src/server/serve.c +++ b/src/server/serve.c @@ -820,14 +820,21 @@ void serve_cleanup(struct server *params, void *status; info("cleaning up"); - - if (params->server_fd) { - close(params->server_fd); - } - - /* close the control socket too */ + + /* Close the control socket, and wait for it to close before proceeding. + * If we do not wait, we risk a race condition with the tail supervisor + * sending a status command, and deadlocking the mirroring. */ if (params->flexnbd && params->flexnbd->control) { + debug("closing control socket"); control_signal_close(params->flexnbd->control); + + debug("waiting for control socket to close"); + control_wait_for_close(params->flexnbd->control); + } + + if (params->server_fd) { + debug("closing server_fd"); + close(params->server_fd); } /* need to stop background build if we're killed very early on */ From 654d277453e2818d1e0ded2e25188d9ebff774b5 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 16:40:53 +0000 Subject: [PATCH 5/5] Updated changelog --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debian/changelog b/debian/changelog index f115fbf..e79dead 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +flexnbd (0.4.0) UNRELEASED; urgency=medium + + * Explicitly close the server control socket, and wait for it to close, to + prevent deadlocks during the server clean-up process (#40 !58) + + -- Patrick J Cherry Fri, 07 Dec 2018 16:38:56 +0000 + flexnbd (0.4.0) stable; urgency=medium * Ensure proxy state is completely reset before upstream init is read,