From ce9499efce3b7ccceb5d5c4fc9f2eb75f830a782 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 13:59:49 +0000 Subject: [PATCH 01/11] Rubocop; add test to bombard a migration source with status commands --- .../acceptance/test_write_during_migration.rb | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) mode change 100644 => 100755 tests/acceptance/test_write_during_migration.rb diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb old mode 100644 new mode 100755 index dd259c5..053fcc2 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -82,15 +82,14 @@ class TestWriteDuringMigration < Test::Unit::TestCase UNIXSocket.open(@source_sock) do |sock| sock.write(['mirror', '127.0.0.1', @dest_port.to_s, 'exit'].join("\x0A") + "\x0A\x0A") sock.flush - rsp = sock.readline + sock.readline end end def wait_for_quit Timeout.timeout(10) do - start_time = Time.now - dst_result = Process.waitpid2(@dst_proc) - src_result = Process.waitpid2(@src_proc) + Process.waitpid2(@dst_proc) + Process.waitpid2(@src_proc) end end @@ -100,13 +99,28 @@ class TestWriteDuringMigration < Test::Unit::TestCase loop do begin client.write(offsets[rand(offsets.size)] * 4096, @write_data) - rescue StandardError => err + rescue StandardError # We expect a broken write at some point, so ignore it break end end end + def bombard_with_status + loop do + begin + UNIXSocket.open(@source_sock) do |sock| + sock.write("status\x0A\x0A") + sock.flush + sock.readline + end + rescue Errno::ENOENT + # If the socket disappears, that's OK. + break + end + end + end + def assert_both_sides_identical # puts `md5sum #{@source_file} #{@dest_file}` @@ -160,5 +174,25 @@ class TestWriteDuringMigration < Test::Unit::TestCase (src_writers_1 + src_writers_2).each(&:join) assert_both_sides_identical end - end end + end + end + + + def test_status_call_after_cleanup + Dir.mktmpdir do |tmpdir| + Dir.chdir(tmpdir) do + make_files + + launch_servers + + status_poker = Thread.new { bombard_with_status } + + start_mirror + + wait_for_quit + status_poker.join + assert_both_sides_identical + end + end + end end From 70a3a4bb55e621947f2eac4af99f5a6c41dcd03a Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 15:02:55 +0000 Subject: [PATCH 02/11] Close the control socket during cleanup This should prevent further requests coming in, triggering deadlocks. --- src/server/serve.c | 14 +++++--------- tests/acceptance/test_write_during_migration.rb | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/server/serve.c b/src/server/serve.c index e5cc588..af7afb4 100644 --- a/src/server/serve.c +++ b/src/server/serve.c @@ -827,6 +827,11 @@ void serve_cleanup(struct server *params, close(params->server_fd); } + /* close the control socket too */ + if (params->flexnbd && params->flexnbd->control) { + control_signal_close(params->flexnbd->control); + } + /* need to stop background build if we're killed very early on */ pthread_cancel(params->allocation_map_builder_thread); pthread_join(params->allocation_map_builder_thread, &status); @@ -861,15 +866,6 @@ void serve_cleanup(struct server *params, server_unlock_acl(params); } - /* if( params->flexnbd ) { */ - /* if ( params->flexnbd->control ) { */ - /* flexnbd_stop_control( params->flexnbd ); */ - /* } */ - /* flexnbd_destroy( params->flexnbd ); */ - /* } */ - - /* server_destroy( params ); */ - debug("Cleanup done"); } diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb index 053fcc2..70ef9e6 100755 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -114,7 +114,7 @@ class TestWriteDuringMigration < Test::Unit::TestCase sock.flush sock.readline end - rescue Errno::ENOENT + rescue StandardError # If the socket disappears, that's OK. break end From 5839a36ab10ed60411ab6e1c0c5069109d6b9084 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 15:05:19 +0000 Subject: [PATCH 03/11] Remove useless function definition --- src/server/serve.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/server/serve.c b/src/server/serve.c index af7afb4..28970aa 100644 --- a/src/server/serve.c +++ b/src/server/serve.c @@ -812,8 +812,6 @@ void server_control_arrived(struct server *serve) } -void flexnbd_stop_control(struct flexnbd *flexnbd); - /** Closes sockets, frees memory and waits for all client threads to finish */ void serve_cleanup(struct server *params, int fatal __attribute__ ((unused))) From 842e7d362d0c6231ba71341b07b51dc5cdcc0914 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 16:32:58 +0000 Subject: [PATCH 04/11] Ensure control socket is closed first, and wait for it to close. --- src/server/control.c | 8 ++++++++ src/server/control.h | 1 + src/server/serve.c | 19 +++++++++++++------ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/server/control.c b/src/server/control.c index 394854e..51e58e9 100644 --- a/src/server/control.c +++ b/src/server/control.c @@ -78,6 +78,14 @@ void control_destroy(struct control *control) free(control); } +void control_wait_for_close(struct control *control) +{ + NULLCHECK(control); + while (!fd_is_closed(control->control_fd)) { + usleep(10000); + } +} + struct control_client *control_client_create(struct flexnbd *flexnbd, int client_fd, struct mbox *state_mbox) diff --git a/src/server/control.h b/src/server/control.h index 017611d..551265f 100644 --- a/src/server/control.h +++ b/src/server/control.h @@ -47,6 +47,7 @@ struct control_client { struct control *control_create(struct flexnbd *, const char *control_socket_name); void control_signal_close(struct control *); +void control_wait_for_close(struct control *control); void control_destroy(struct control *); void *control_runner(void *); diff --git a/src/server/serve.c b/src/server/serve.c index 28970aa..c262189 100644 --- a/src/server/serve.c +++ b/src/server/serve.c @@ -820,14 +820,21 @@ void serve_cleanup(struct server *params, void *status; info("cleaning up"); - - if (params->server_fd) { - close(params->server_fd); - } - - /* close the control socket too */ + + /* Close the control socket, and wait for it to close before proceeding. + * If we do not wait, we risk a race condition with the tail supervisor + * sending a status command, and deadlocking the mirroring. */ if (params->flexnbd && params->flexnbd->control) { + debug("closing control socket"); control_signal_close(params->flexnbd->control); + + debug("waiting for control socket to close"); + control_wait_for_close(params->flexnbd->control); + } + + if (params->server_fd) { + debug("closing server_fd"); + close(params->server_fd); } /* need to stop background build if we're killed very early on */ From 654d277453e2818d1e0ded2e25188d9ebff774b5 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 16:40:53 +0000 Subject: [PATCH 05/11] Updated changelog --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debian/changelog b/debian/changelog index f115fbf..e79dead 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +flexnbd (0.4.0) UNRELEASED; urgency=medium + + * Explicitly close the server control socket, and wait for it to close, to + prevent deadlocks during the server clean-up process (#40 !58) + + -- Patrick J Cherry Fri, 07 Dec 2018 16:38:56 +0000 + flexnbd (0.4.0) stable; urgency=medium * Ensure proxy state is completely reset before upstream init is read, From a4d641b215bbec991854b04a656561abd7190ddd Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 21:47:14 +0000 Subject: [PATCH 06/11] Ensure ev abandon_watcher is stopped before reuse. --- src/server/mirror.c | 1 + .../acceptance/test_write_during_migration.rb | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/server/mirror.c b/src/server/mirror.c index b1d917b..75bf3f5 100644 --- a/src/server/mirror.c +++ b/src/server/mirror.c @@ -671,6 +671,7 @@ static void mirror_abandon_cb(struct ev_loop *loop, ev_io * w, int revents) debug("Abandon message received"); mirror_set_state(ctrl->mirror, MS_ABANDONED); self_pipe_signal_clear(ctrl->mirror->abandon_signal); + ev_io_stop(loop, &ctrl->abandon_watcher); ev_break(loop, EVBREAK_ONE); return; } diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb index 70ef9e6..bbe41ff 100755 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -86,6 +86,14 @@ class TestWriteDuringMigration < Test::Unit::TestCase end end + def stop_mirror + UNIXSocket.open(@source_sock) do |sock| + sock.write("break\x0A\x0A") + sock.flush + sock.readline + end + end + def wait_for_quit Timeout.timeout(10) do Process.waitpid2(@dst_proc) @@ -195,4 +203,22 @@ class TestWriteDuringMigration < Test::Unit::TestCase end end end + + def test_mirroring_can_be_restarted + Dir.mktmpdir do |tmpdir| + Dir.chdir(tmpdir) do + make_files + + launch_servers + + 3.times do + start_mirror + stop_mirror + end + start_mirror + + wait_for_quit + end + end + end end From 52690f5382491514c3f854e8dc32e8bfbe44d31a Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 21:48:55 +0000 Subject: [PATCH 07/11] Updated changelog --- debian/changelog | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian/changelog b/debian/changelog index e79dead..0f632cf 100644 --- a/debian/changelog +++ b/debian/changelog @@ -2,6 +2,8 @@ flexnbd (0.4.0) UNRELEASED; urgency=medium * Explicitly close the server control socket, and wait for it to close, to prevent deadlocks during the server clean-up process (#40 !58) + * Ensure mirroring can be restarted after a break command is sent to the + source (#38, !59) -- Patrick J Cherry Fri, 07 Dec 2018 16:38:56 +0000 From 39400f2e09d933b73349abcea43bf93ec39760c0 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 21:50:56 +0000 Subject: [PATCH 08/11] Fixed issue number. --- debian/changelog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index 0f632cf..6c33cdc 100644 --- a/debian/changelog +++ b/debian/changelog @@ -3,7 +3,7 @@ flexnbd (0.4.0) UNRELEASED; urgency=medium * Explicitly close the server control socket, and wait for it to close, to prevent deadlocks during the server clean-up process (#40 !58) * Ensure mirroring can be restarted after a break command is sent to the - source (#38, !59) + source (#37, !59) -- Patrick J Cherry Fri, 07 Dec 2018 16:38:56 +0000 From e5133a50bdf05eac2f26393c9c574ab9568fe033 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 22:09:21 +0000 Subject: [PATCH 09/11] Slow down start/stop. Enable DEBUG Trying to working why this is failing in gitlab-ci --- tests/acceptance/test_write_during_migration.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb index bbe41ff..83cc6d1 100755 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -185,7 +185,6 @@ class TestWriteDuringMigration < Test::Unit::TestCase end end - def test_status_call_after_cleanup Dir.mktmpdir do |tmpdir| Dir.chdir(tmpdir) do @@ -209,11 +208,15 @@ class TestWriteDuringMigration < Test::Unit::TestCase Dir.chdir(tmpdir) do make_files + ENV['DEBUG'] = '1' launch_servers + ENV.delete 'DEBUG' 3.times do start_mirror + sleep 0.1 stop_mirror + sleep 0.1 end start_mirror From 6b1a877dc3f6b29ab02eeb2ab306098647d3d946 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Fri, 7 Dec 2018 22:43:50 +0000 Subject: [PATCH 10/11] Tweaked test file size, and removed debug ENV fiddling --- tests/acceptance/test_write_during_migration.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/acceptance/test_write_during_migration.rb b/tests/acceptance/test_write_during_migration.rb index 83cc6d1..96ba97d 100755 --- a/tests/acceptance/test_write_during_migration.rb +++ b/tests/acceptance/test_write_during_migration.rb @@ -204,14 +204,16 @@ class TestWriteDuringMigration < Test::Unit::TestCase end def test_mirroring_can_be_restarted + @size = 100 * 1024 * 1024 # 100MB Dir.mktmpdir do |tmpdir| Dir.chdir(tmpdir) do make_files - ENV['DEBUG'] = '1' launch_servers - ENV.delete 'DEBUG' + # This is a bit racy. It needs to be slow enough that the migration + # isn't finished before the stop runs, and slow enough so that we can + # stop/start a few times. 3.times do start_mirror sleep 0.1 From 3448ff15b88a680ea3b262c1ea22d7cea04b6ae0 Mon Sep 17 00:00:00 2001 From: "James F. Carter" Date: Fri, 11 Jan 2019 10:37:48 +0000 Subject: [PATCH 11/11] release 0.5.0 --- debian/changelog | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/debian/changelog b/debian/changelog index 6c33cdc..53392a2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,11 +1,12 @@ -flexnbd (0.4.0) UNRELEASED; urgency=medium +flexnbd (0.5.0) stable; urgency=medium + [ Patrick J Cherry ] * Explicitly close the server control socket, and wait for it to close, to prevent deadlocks during the server clean-up process (#40 !58) * Ensure mirroring can be restarted after a break command is sent to the source (#37, !59) - -- Patrick J Cherry Fri, 07 Dec 2018 16:38:56 +0000 + -- James Carter Fri, 11 Jan 2019 10:37:23 +0000 flexnbd (0.4.0) stable; urgency=medium