Merge branch '40-when-migrating-qemu-discs-from-one-tail-to-another-there-s-an-edge-case-where-flexnbd-freezes-and-the-vm-eventually-disconnects-and-becomes-unstartable' into 'develop'

Resolve "When migrating QEmu discs from one tail to another there's an edge case where flexnbd freezes and the VM eventually disconnects and becomes unstartable."

Closes #40

See merge request open-source/flexnbd-c!58
This commit is contained in:
James Carter
2018-12-07 17:12:21 +00:00
5 changed files with 68 additions and 17 deletions

7
debian/changelog vendored
View File

@@ -1,3 +1,10 @@
flexnbd (0.4.0) UNRELEASED; urgency=medium
* Explicitly close the server control socket, and wait for it to close, to
prevent deadlocks during the server clean-up process (#40 !58)
-- Patrick J Cherry <patrick@bytemark.co.uk> Fri, 07 Dec 2018 16:38:56 +0000
flexnbd (0.4.0) stable; urgency=medium
* Ensure proxy state is completely reset before upstream init is read,

View File

@@ -78,6 +78,14 @@ void control_destroy(struct control *control)
free(control);
}
void control_wait_for_close(struct control *control)
{
NULLCHECK(control);
while (!fd_is_closed(control->control_fd)) {
usleep(10000);
}
}
struct control_client *control_client_create(struct flexnbd *flexnbd,
int client_fd,
struct mbox *state_mbox)

View File

@@ -47,6 +47,7 @@ struct control_client {
struct control *control_create(struct flexnbd *,
const char *control_socket_name);
void control_signal_close(struct control *);
void control_wait_for_close(struct control *control);
void control_destroy(struct control *);
void *control_runner(void *);

View File

@@ -812,8 +812,6 @@ void server_control_arrived(struct server *serve)
}
void flexnbd_stop_control(struct flexnbd *flexnbd);
/** Closes sockets, frees memory and waits for all client threads to finish */
void serve_cleanup(struct server *params,
int fatal __attribute__ ((unused)))
@@ -823,7 +821,19 @@ void serve_cleanup(struct server *params,
info("cleaning up");
/* Close the control socket, and wait for it to close before proceeding.
* If we do not wait, we risk a race condition with the tail supervisor
* sending a status command, and deadlocking the mirroring. */
if (params->flexnbd && params->flexnbd->control) {
debug("closing control socket");
control_signal_close(params->flexnbd->control);
debug("waiting for control socket to close");
control_wait_for_close(params->flexnbd->control);
}
if (params->server_fd) {
debug("closing server_fd");
close(params->server_fd);
}
@@ -861,15 +871,6 @@ void serve_cleanup(struct server *params,
server_unlock_acl(params);
}
/* if( params->flexnbd ) { */
/* if ( params->flexnbd->control ) { */
/* flexnbd_stop_control( params->flexnbd ); */
/* } */
/* flexnbd_destroy( params->flexnbd ); */
/* } */
/* server_destroy( params ); */
debug("Cleanup done");
}

46
tests/acceptance/test_write_during_migration.rb Normal file → Executable file
View File

@@ -82,15 +82,14 @@ class TestWriteDuringMigration < Test::Unit::TestCase
UNIXSocket.open(@source_sock) do |sock|
sock.write(['mirror', '127.0.0.1', @dest_port.to_s, 'exit'].join("\x0A") + "\x0A\x0A")
sock.flush
rsp = sock.readline
sock.readline
end
end
def wait_for_quit
Timeout.timeout(10) do
start_time = Time.now
dst_result = Process.waitpid2(@dst_proc)
src_result = Process.waitpid2(@src_proc)
Process.waitpid2(@dst_proc)
Process.waitpid2(@src_proc)
end
end
@@ -100,13 +99,28 @@ class TestWriteDuringMigration < Test::Unit::TestCase
loop do
begin
client.write(offsets[rand(offsets.size)] * 4096, @write_data)
rescue StandardError => err
rescue StandardError
# We expect a broken write at some point, so ignore it
break
end
end
end
def bombard_with_status
loop do
begin
UNIXSocket.open(@source_sock) do |sock|
sock.write("status\x0A\x0A")
sock.flush
sock.readline
end
rescue StandardError
# If the socket disappears, that's OK.
break
end
end
end
def assert_both_sides_identical
# puts `md5sum #{@source_file} #{@dest_file}`
@@ -160,5 +174,25 @@ class TestWriteDuringMigration < Test::Unit::TestCase
(src_writers_1 + src_writers_2).each(&:join)
assert_both_sides_identical
end
end end
end
end
def test_status_call_after_cleanup
Dir.mktmpdir do |tmpdir|
Dir.chdir(tmpdir) do
make_files
launch_servers
status_poker = Thread.new { bombard_with_status }
start_mirror
wait_for_quit
status_poker.join
assert_both_sides_identical
end
end
end
end