Merge branch '40-when-migrating-qemu-discs-from-one-tail-to-another-there-s-an-edge-case-where-flexnbd-freezes-and-the-vm-eventually-disconnects-and-becomes-unstartable' into 'develop'
Resolve "When migrating QEmu discs from one tail to another there's an edge case where flexnbd freezes and the VM eventually disconnects and becomes unstartable." Closes #40 See merge request open-source/flexnbd-c!58
This commit is contained in:
7
debian/changelog
vendored
7
debian/changelog
vendored
@@ -1,3 +1,10 @@
|
||||
flexnbd (0.4.0) UNRELEASED; urgency=medium
|
||||
|
||||
* Explicitly close the server control socket, and wait for it to close, to
|
||||
prevent deadlocks during the server clean-up process (#40 !58)
|
||||
|
||||
-- Patrick J Cherry <patrick@bytemark.co.uk> Fri, 07 Dec 2018 16:38:56 +0000
|
||||
|
||||
flexnbd (0.4.0) stable; urgency=medium
|
||||
|
||||
* Ensure proxy state is completely reset before upstream init is read,
|
||||
|
@@ -78,6 +78,14 @@ void control_destroy(struct control *control)
|
||||
free(control);
|
||||
}
|
||||
|
||||
void control_wait_for_close(struct control *control)
|
||||
{
|
||||
NULLCHECK(control);
|
||||
while (!fd_is_closed(control->control_fd)) {
|
||||
usleep(10000);
|
||||
}
|
||||
}
|
||||
|
||||
struct control_client *control_client_create(struct flexnbd *flexnbd,
|
||||
int client_fd,
|
||||
struct mbox *state_mbox)
|
||||
|
@@ -47,6 +47,7 @@ struct control_client {
|
||||
struct control *control_create(struct flexnbd *,
|
||||
const char *control_socket_name);
|
||||
void control_signal_close(struct control *);
|
||||
void control_wait_for_close(struct control *control);
|
||||
void control_destroy(struct control *);
|
||||
|
||||
void *control_runner(void *);
|
||||
|
@@ -812,8 +812,6 @@ void server_control_arrived(struct server *serve)
|
||||
}
|
||||
|
||||
|
||||
void flexnbd_stop_control(struct flexnbd *flexnbd);
|
||||
|
||||
/** Closes sockets, frees memory and waits for all client threads to finish */
|
||||
void serve_cleanup(struct server *params,
|
||||
int fatal __attribute__ ((unused)))
|
||||
@@ -822,8 +820,20 @@ void serve_cleanup(struct server *params,
|
||||
void *status;
|
||||
|
||||
info("cleaning up");
|
||||
|
||||
/* Close the control socket, and wait for it to close before proceeding.
|
||||
* If we do not wait, we risk a race condition with the tail supervisor
|
||||
* sending a status command, and deadlocking the mirroring. */
|
||||
if (params->flexnbd && params->flexnbd->control) {
|
||||
debug("closing control socket");
|
||||
control_signal_close(params->flexnbd->control);
|
||||
|
||||
debug("waiting for control socket to close");
|
||||
control_wait_for_close(params->flexnbd->control);
|
||||
}
|
||||
|
||||
if (params->server_fd) {
|
||||
debug("closing server_fd");
|
||||
close(params->server_fd);
|
||||
}
|
||||
|
||||
@@ -861,15 +871,6 @@ void serve_cleanup(struct server *params,
|
||||
server_unlock_acl(params);
|
||||
}
|
||||
|
||||
/* if( params->flexnbd ) { */
|
||||
/* if ( params->flexnbd->control ) { */
|
||||
/* flexnbd_stop_control( params->flexnbd ); */
|
||||
/* } */
|
||||
/* flexnbd_destroy( params->flexnbd ); */
|
||||
/* } */
|
||||
|
||||
/* server_destroy( params ); */
|
||||
|
||||
debug("Cleanup done");
|
||||
}
|
||||
|
||||
|
46
tests/acceptance/test_write_during_migration.rb
Normal file → Executable file
46
tests/acceptance/test_write_during_migration.rb
Normal file → Executable file
@@ -82,15 +82,14 @@ class TestWriteDuringMigration < Test::Unit::TestCase
|
||||
UNIXSocket.open(@source_sock) do |sock|
|
||||
sock.write(['mirror', '127.0.0.1', @dest_port.to_s, 'exit'].join("\x0A") + "\x0A\x0A")
|
||||
sock.flush
|
||||
rsp = sock.readline
|
||||
sock.readline
|
||||
end
|
||||
end
|
||||
|
||||
def wait_for_quit
|
||||
Timeout.timeout(10) do
|
||||
start_time = Time.now
|
||||
dst_result = Process.waitpid2(@dst_proc)
|
||||
src_result = Process.waitpid2(@src_proc)
|
||||
Process.waitpid2(@dst_proc)
|
||||
Process.waitpid2(@src_proc)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -100,13 +99,28 @@ class TestWriteDuringMigration < Test::Unit::TestCase
|
||||
loop do
|
||||
begin
|
||||
client.write(offsets[rand(offsets.size)] * 4096, @write_data)
|
||||
rescue StandardError => err
|
||||
rescue StandardError
|
||||
# We expect a broken write at some point, so ignore it
|
||||
break
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def bombard_with_status
|
||||
loop do
|
||||
begin
|
||||
UNIXSocket.open(@source_sock) do |sock|
|
||||
sock.write("status\x0A\x0A")
|
||||
sock.flush
|
||||
sock.readline
|
||||
end
|
||||
rescue StandardError
|
||||
# If the socket disappears, that's OK.
|
||||
break
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def assert_both_sides_identical
|
||||
# puts `md5sum #{@source_file} #{@dest_file}`
|
||||
|
||||
@@ -160,5 +174,25 @@ class TestWriteDuringMigration < Test::Unit::TestCase
|
||||
(src_writers_1 + src_writers_2).each(&:join)
|
||||
assert_both_sides_identical
|
||||
end
|
||||
end end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
def test_status_call_after_cleanup
|
||||
Dir.mktmpdir do |tmpdir|
|
||||
Dir.chdir(tmpdir) do
|
||||
make_files
|
||||
|
||||
launch_servers
|
||||
|
||||
status_poker = Thread.new { bombard_with_status }
|
||||
|
||||
start_mirror
|
||||
|
||||
wait_for_quit
|
||||
status_poker.join
|
||||
assert_both_sides_identical
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
Reference in New Issue
Block a user