Fix for deadlocks when writing while migrating

This commit is contained in:
Alex Young
2012-09-13 12:21:43 +01:00
parent 13328910c8
commit 8b43321ef2
3 changed files with 100 additions and 39 deletions

View File

@@ -579,7 +579,7 @@ void server_close_clients( struct server *params )
info("closing all clients"); info("closing all clients");
int i, j; int i; /* , j; */
struct client_tbl_entry *entry; struct client_tbl_entry *entry;
for( i = 0; i < params->max_nbd_clients; i++ ) { for( i = 0; i < params->max_nbd_clients; i++ ) {
@@ -590,9 +590,17 @@ void server_close_clients( struct server *params )
client_signal_stop( entry->client ); client_signal_stop( entry->client );
} }
} }
for( j = 0; j < params->max_nbd_clients; j++ ) { /* We don't join the clients here. When we enter the final
join_client_thread( &params->nbd_client[j] ); * mirror pass, we get the IO lock, then wait for the server_fd
} * to close before sending the data, to be sure that no new
* clients can be accepted which might think they've written
* to the disc. However, an existing client thread can be
* waiting for the IO lock already, so if we try to join it
* here, we deadlock.
*
* The client threads will be joined in serve_cleanup.
*
*/
} }

View File

@@ -1,60 +1,102 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
require 'rubygems' require 'rubygems'
require 'linux/nbd_client' require 'flexnbd/fake_source'
require 'socket' require 'socket'
require 'fileutils' require 'fileutils'
FLEXNBD = ARGV.shift
raise "No binary!" unless File.executable?( FLEXNBD )
raise "No trickle!" unless system "which trickle > /dev/null"
Thread.abort_on_exception = true Thread.abort_on_exception = true
SIZE = 1024*1024*1024*10 # 10G
#BREAKIT = false
BREAKIT = true
MB = 1024*1024
GB = 1024*MB
SIZE = 20*MB
WRITE_DATA = "foo!" * 2048 # 8K write WRITE_DATA = "foo!" * 2048 # 8K write
SOURCE_PORT = 9990
DEST_PORT = 9991
SOURCE_SOCK = "src.sock"
DEST_SOCK = "dst.sock"
SOURCE_FILE = "src.file"
DEST_FILE = "dst.file"
puts "Cleaning up from old runs..." puts "Cleaning up from old runs..."
FileUtils.rm_f "src*" FileUtils.rm_f "src*"
FileUtils.rm_f "dst*" FileUtils.rm_f "dst*"
puts "Creating source & destination files..." puts "Creating source & destination files..."
FileUtils.touch("src.file") FileUtils.touch(SOURCE_FILE)
File.truncate("src.file", SIZE) File.truncate(SOURCE_FILE, SIZE)
FileUtils.touch("dst.file") FileUtils.touch(DEST_FILE)
File.truncate("dst.file", SIZE) File.truncate(DEST_FILE, SIZE)
puts "Making filesystem on source..." puts "Making source data"
result = Kernel.system "/sbin/mkfs.ext3 -F src.file" # some data to start with File.open(SOURCE_FILE, "wb"){|f| f.write "a"*SIZE }
puts "Result of making filesystem: #{result.inspect}" #Kernel.system "dd if=/dev/urandom of=#{SOURCE_FILE} bs=#{SIZE} count=1"
puts "Starting destination process..." puts "Starting destination process..."
dst_proc = fork() { dst_proc = fork() {
exec("flexnbd listen -l 127.0.0.1 -p 9991 -f dst.file -s dst.sock -v >dst.stdout 2>dst.stderr") cmd = "#{FLEXNBD} listen -l 127.0.0.1 -p #{DEST_PORT} -f #{DEST_FILE} -s #{DEST_SOCK} -v"
exec cmd
} }
puts "Starting source process..." puts "Starting source process..."
src_proc = fork() { src_proc = fork() {
exec("flexnbd serve -l 127.0.0.1 -p 9990 -f src.file -s src.sock -v >src.stdout 2>src.stderr") cmd = "#{FLEXNBD} serve -l 127.0.0.1 -p #{SOURCE_PORT} -f #{SOURCE_FILE} -s #{SOURCE_SOCK} -v"
#exec "trickle -t 1 -u 5120 -s #{cmd}"
exec cmd
} }
puts "dst_proc is #{dst_proc}"
puts "src_proc is #{src_proc}"
at_exit {
[dst_proc, src_proc].each do |pid|
Process.kill( "KILL", pid ) rescue nil
end
[SOURCE_FILE, DEST_FILE, SOURCE_SOCK, DEST_SOCK].each do |filename|
FileUtils.rm_f filename
end
}
puts "Sleeping to let flexnbds run..." puts "Sleeping to let flexnbds run..."
sleep 10 Timeout.timeout(10) do
sleep 0.1 while !File.exists?( SOURCE_SOCK )
puts "Got source sock"
sleep 0.1 while !File.exists?( DEST_SOCK )
puts "Got dest sock"
end
if BREAKIT
# Start writing to the source
puts "Creating thread to write to the source..."
# Start writing to the source src_writer = Thread.new do
puts "Creating thread to write to the source..." client = FlexNBD::FakeSource.new( "127.0.0.1", SOURCE_PORT, "Timed out connecting" )
src_writer = Thread.new do loop do
client = Linux::NbdClient.new("127.0.0.1", 9990, true) begin
loop do client.write(0, WRITE_DATA)
begin rescue => err
client.write(0, WRITE_DATA) puts "Writer encountered #{err.inspect} writing to source"
rescue => err break
puts "Writer encountered #{err.inspect} writing to source" end
break
end end
end end
end end
puts "Starting mirroring process..." puts "Starting mirroring process..."
UNIXSocket.open("src.sock") {|sock| UNIXSocket.open(SOURCE_SOCK) {|sock|
sock.write(["mirror", "127.0.0.1", "9991", "unlink"].join("\x0A") + "\x0A\x0A") sock.write(["mirror", "127.0.0.1", DEST_PORT.to_s, "unlink"].join("\x0A") + "\x0A\x0A")
sock.flush sock.flush
rsp = sock.readline rsp = sock.readline
puts "Response: #{rsp}" puts "Response: #{rsp}"
@@ -62,15 +104,21 @@ UNIXSocket.open("src.sock") {|sock|
# tell serve to migrate to the listen one # tell serve to migrate to the listen one
puts "Waiting for destination to exit..." Timeout.timeout( 10 ) do
dst_result = Process::waitpid2(dst_proc) start_time = Time.now
puts "destination exited: #{dst_result.inspect}" puts "Waiting for destination #{dst_proc} to exit..."
dst_result = Process::waitpid2(dst_proc)
puts "destination exited: #{dst_result.inspect}"
puts "Waiting for source to exit..." puts "Waiting for source to exit..."
src_result = Process::waitpid2(src_proc) src_result = Process::waitpid2(src_proc)
puts "Source exited: #{src_result.inspect}" puts "Source exited after #{Time.now - start_time} seconds: #{src_result.inspect}"
end
puts "Waiting for writer to die..."
src_writer.join if BREAKIT
puts "Writer has died" puts "Waiting for writer to die..."
src_writer.join
puts "Writer has died"
end

View File

@@ -90,14 +90,19 @@ module FlexNBD
def send_mirror def send_mirror
read_hello() read_hello()
write_write_request( 0, 8 ) write( 0, "12345678" )
write_data( "12345678" )
read_response() read_response()
write_disconnect_request() write_disconnect_request()
close() close()
end end
def write( from, data )
write_write_request( from, data.length )
write_data( data )
end
def read_response def read_response
magic = @sock.read(4) magic = @sock.read(4)
error_s = @sock.read(4) error_s = @sock.read(4)