Merge branch '40-when-migrating-qemu-discs-from-one-tail-to-another-there-s-an-edge-case-where-flexnbd-freezes-and-the-vm-eventually-disconnects-and-becomes-unstartable' into 'develop'
Resolve "When migrating QEmu discs from one tail to another there's an edge case where flexnbd freezes and the VM eventually disconnects and becomes unstartable." Closes #40 See merge request open-source/flexnbd-c!58
This commit is contained in:
7
debian/changelog
vendored
7
debian/changelog
vendored
@@ -1,3 +1,10 @@
|
|||||||
|
flexnbd (0.4.0) UNRELEASED; urgency=medium
|
||||||
|
|
||||||
|
* Explicitly close the server control socket, and wait for it to close, to
|
||||||
|
prevent deadlocks during the server clean-up process (#40 !58)
|
||||||
|
|
||||||
|
-- Patrick J Cherry <patrick@bytemark.co.uk> Fri, 07 Dec 2018 16:38:56 +0000
|
||||||
|
|
||||||
flexnbd (0.4.0) stable; urgency=medium
|
flexnbd (0.4.0) stable; urgency=medium
|
||||||
|
|
||||||
* Ensure proxy state is completely reset before upstream init is read,
|
* Ensure proxy state is completely reset before upstream init is read,
|
||||||
|
@@ -78,6 +78,14 @@ void control_destroy(struct control *control)
|
|||||||
free(control);
|
free(control);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void control_wait_for_close(struct control *control)
|
||||||
|
{
|
||||||
|
NULLCHECK(control);
|
||||||
|
while (!fd_is_closed(control->control_fd)) {
|
||||||
|
usleep(10000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct control_client *control_client_create(struct flexnbd *flexnbd,
|
struct control_client *control_client_create(struct flexnbd *flexnbd,
|
||||||
int client_fd,
|
int client_fd,
|
||||||
struct mbox *state_mbox)
|
struct mbox *state_mbox)
|
||||||
|
@@ -47,6 +47,7 @@ struct control_client {
|
|||||||
struct control *control_create(struct flexnbd *,
|
struct control *control_create(struct flexnbd *,
|
||||||
const char *control_socket_name);
|
const char *control_socket_name);
|
||||||
void control_signal_close(struct control *);
|
void control_signal_close(struct control *);
|
||||||
|
void control_wait_for_close(struct control *control);
|
||||||
void control_destroy(struct control *);
|
void control_destroy(struct control *);
|
||||||
|
|
||||||
void *control_runner(void *);
|
void *control_runner(void *);
|
||||||
|
@@ -812,8 +812,6 @@ void server_control_arrived(struct server *serve)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void flexnbd_stop_control(struct flexnbd *flexnbd);
|
|
||||||
|
|
||||||
/** Closes sockets, frees memory and waits for all client threads to finish */
|
/** Closes sockets, frees memory and waits for all client threads to finish */
|
||||||
void serve_cleanup(struct server *params,
|
void serve_cleanup(struct server *params,
|
||||||
int fatal __attribute__ ((unused)))
|
int fatal __attribute__ ((unused)))
|
||||||
@@ -822,8 +820,20 @@ void serve_cleanup(struct server *params,
|
|||||||
void *status;
|
void *status;
|
||||||
|
|
||||||
info("cleaning up");
|
info("cleaning up");
|
||||||
|
|
||||||
|
/* Close the control socket, and wait for it to close before proceeding.
|
||||||
|
* If we do not wait, we risk a race condition with the tail supervisor
|
||||||
|
* sending a status command, and deadlocking the mirroring. */
|
||||||
|
if (params->flexnbd && params->flexnbd->control) {
|
||||||
|
debug("closing control socket");
|
||||||
|
control_signal_close(params->flexnbd->control);
|
||||||
|
|
||||||
|
debug("waiting for control socket to close");
|
||||||
|
control_wait_for_close(params->flexnbd->control);
|
||||||
|
}
|
||||||
|
|
||||||
if (params->server_fd) {
|
if (params->server_fd) {
|
||||||
|
debug("closing server_fd");
|
||||||
close(params->server_fd);
|
close(params->server_fd);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -861,15 +871,6 @@ void serve_cleanup(struct server *params,
|
|||||||
server_unlock_acl(params);
|
server_unlock_acl(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if( params->flexnbd ) { */
|
|
||||||
/* if ( params->flexnbd->control ) { */
|
|
||||||
/* flexnbd_stop_control( params->flexnbd ); */
|
|
||||||
/* } */
|
|
||||||
/* flexnbd_destroy( params->flexnbd ); */
|
|
||||||
/* } */
|
|
||||||
|
|
||||||
/* server_destroy( params ); */
|
|
||||||
|
|
||||||
debug("Cleanup done");
|
debug("Cleanup done");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
46
tests/acceptance/test_write_during_migration.rb
Normal file → Executable file
46
tests/acceptance/test_write_during_migration.rb
Normal file → Executable file
@@ -82,15 +82,14 @@ class TestWriteDuringMigration < Test::Unit::TestCase
|
|||||||
UNIXSocket.open(@source_sock) do |sock|
|
UNIXSocket.open(@source_sock) do |sock|
|
||||||
sock.write(['mirror', '127.0.0.1', @dest_port.to_s, 'exit'].join("\x0A") + "\x0A\x0A")
|
sock.write(['mirror', '127.0.0.1', @dest_port.to_s, 'exit'].join("\x0A") + "\x0A\x0A")
|
||||||
sock.flush
|
sock.flush
|
||||||
rsp = sock.readline
|
sock.readline
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def wait_for_quit
|
def wait_for_quit
|
||||||
Timeout.timeout(10) do
|
Timeout.timeout(10) do
|
||||||
start_time = Time.now
|
Process.waitpid2(@dst_proc)
|
||||||
dst_result = Process.waitpid2(@dst_proc)
|
Process.waitpid2(@src_proc)
|
||||||
src_result = Process.waitpid2(@src_proc)
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -100,13 +99,28 @@ class TestWriteDuringMigration < Test::Unit::TestCase
|
|||||||
loop do
|
loop do
|
||||||
begin
|
begin
|
||||||
client.write(offsets[rand(offsets.size)] * 4096, @write_data)
|
client.write(offsets[rand(offsets.size)] * 4096, @write_data)
|
||||||
rescue StandardError => err
|
rescue StandardError
|
||||||
# We expect a broken write at some point, so ignore it
|
# We expect a broken write at some point, so ignore it
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def bombard_with_status
|
||||||
|
loop do
|
||||||
|
begin
|
||||||
|
UNIXSocket.open(@source_sock) do |sock|
|
||||||
|
sock.write("status\x0A\x0A")
|
||||||
|
sock.flush
|
||||||
|
sock.readline
|
||||||
|
end
|
||||||
|
rescue StandardError
|
||||||
|
# If the socket disappears, that's OK.
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def assert_both_sides_identical
|
def assert_both_sides_identical
|
||||||
# puts `md5sum #{@source_file} #{@dest_file}`
|
# puts `md5sum #{@source_file} #{@dest_file}`
|
||||||
|
|
||||||
@@ -160,5 +174,25 @@ class TestWriteDuringMigration < Test::Unit::TestCase
|
|||||||
(src_writers_1 + src_writers_2).each(&:join)
|
(src_writers_1 + src_writers_2).each(&:join)
|
||||||
assert_both_sides_identical
|
assert_both_sides_identical
|
||||||
end
|
end
|
||||||
end end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
def test_status_call_after_cleanup
|
||||||
|
Dir.mktmpdir do |tmpdir|
|
||||||
|
Dir.chdir(tmpdir) do
|
||||||
|
make_files
|
||||||
|
|
||||||
|
launch_servers
|
||||||
|
|
||||||
|
status_poker = Thread.new { bombard_with_status }
|
||||||
|
|
||||||
|
start_mirror
|
||||||
|
|
||||||
|
wait_for_quit
|
||||||
|
status_poker.join
|
||||||
|
assert_both_sides_identical
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
Reference in New Issue
Block a user