Retry failed rebind attempts

When we receive a migration, if rebinding to the new listen address and
port fails for a reason which might be fixable, rather than killing the
server we retry once a second.  Also in this patch: non-overlapping log
messages and a fix for the client going away halfway through a sendfile
loop.
This commit is contained in:
Alex Young
2012-07-12 14:14:46 +01:00
parent 9002341e77
commit 10b46beeea
12 changed files with 194 additions and 20 deletions

View File

@@ -5,7 +5,7 @@ require 'file_writer'
class Environment
attr_reader( :blocksize, :filename1, :filename2, :ip,
:port1, :port2, :nbd1, :nbd2, :file1, :file2 )
:port1, :port2, :nbd1, :nbd2, :file1, :file2, :rebind_port1 )
def initialize
@blocksize = 1024
@@ -14,9 +14,11 @@ class Environment
@ip = "127.0.0.1"
@available_ports = [*40000..41000] - listening_ports
@port1 = @available_ports.shift
@rebind_port1 = @available_ports.shift
@port2 = @available_ports.shift
@nbd1 = FlexNBD.new("../../build/flexnbd", @ip, @port1)
@nbd2 = FlexNBD.new("../../build/flexnbd", @ip, @port2)
@rebind_port2 = @available_ports.shift
@nbd1 = FlexNBD.new("../../build/flexnbd", @ip, @port1, @ip, @rebind_port1)
@nbd2 = FlexNBD.new("../../build/flexnbd", @ip, @port2, @ip, @rebind_port2)
@fake_pid = nil
end
@@ -95,6 +97,7 @@ class Environment
end
@nbd1.can_die(0)
@nbd1.kill
@nbd2.kill
@@ -104,7 +107,7 @@ class Environment
end
def run_fake( name, addr, port )
def run_fake( name, addr, port, rebind_addr = addr, rebind_port = port )
fakedir = File.join( File.dirname( __FILE__ ), "fakes" )
fake = Dir[File.join( fakedir, name ) + "*"].sort.find { |fn|
File.executable?( fn )
@@ -113,8 +116,11 @@ class Environment
raise "no fake executable" unless fake
raise "no addr" unless addr
raise "no port" unless port
raise "no rebind_addr" unless rebind_addr
raise "no rebind_port" unless rebind_port
@fake_pid = fork do
exec fake + " " + addr.to_s + " " + port.to_s + " " + @nbd1.pid.to_s
exec [fake, addr, port, @nbd1.pid, rebind_addr, rebind_port].map{|x| x.to_s}.join(" ")
end
sleep(0.5)
end

View File

@@ -24,5 +24,6 @@ client.close
client2 = server.accept
client2.receive_mirror
exit(0)

View File

@@ -9,7 +9,7 @@
require 'flexnbd/fake_source'
include FlexNBD
addr, port, srv_pid = *ARGV
addr, port, srv_pid, rebind_addr, rebind_port = *ARGV
client = FakeSource.new( addr, port, "Timed out connecting" )
client.read_hello
@@ -25,8 +25,8 @@ sleep(0.25)
client2 = FakeSource.new( addr, port, "Timed out reconnecting to mirror" )
client2.send_mirror
sleep(0.25)
client3 = FakeSource.new( addr, port, "Timed out reconnecting to read" )
sleep(1)
client3 = FakeSource.new( rebind_addr, rebind_port, "Timed out reconnecting to read" )
client3.close
exit(0)

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
# Misbehaving fake source.  It connects and, without ever reading the
# hello, immediately writes a read request for 8 bytes at offset 0,
# pulls just 4 raw bytes off the socket and then closes.  The intent
# stated by the original author is that closing here breaks the
# server's sendfile part-way through.
require 'flexnbd/fake_source'
include FlexNBD
# Only the first two arguments are used; any further arguments are
# ignored by this fake.
addr, port = *ARGV
source = FakeSource.new( addr, port, "Timed out connecting" )
source.write_read_request( 0, 8 )
source.read_raw( 4 )
source.close
exit(0)

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env ruby
# Fake source which sends a migration while occupying the address and
# port passed as the rebind target.  The squatting listener is closed
# one second after the mirror has been sent.  The original author's
# stated expectation is that the destination retries its rebind every
# second, so after one more second we connect to the rebind address
# and do a read against the new server.
require 'flexnbd/fake_source'
include FlexNBD
addr, port, _srv_pid, rebind_addr, rebind_port = *ARGV
rebind_port = rebind_port.to_i
# Grab the rebind address before the migration starts.
squatter = TCPServer.open( rebind_addr, rebind_port )
source = FakeSource.new( addr, port, "Timed out connecting" )
source.send_mirror
sleep( 1 )
squatter.close
sleep( 1 )
reader = FakeSource.new( rebind_addr, rebind_port, "Timed out reconnecting" )
reader.read_hello
reader.read( 0, 8 )
reader.close
exit( 0 )

View File

@@ -166,7 +166,7 @@ end # class ValgrindExecutor
# Noddy test class to exercise FlexNBD from the outside for testing.
#
class FlexNBD
attr_reader :bin, :ctrl, :pid, :ip, :port
attr_reader :bin, :ctrl, :pid, :ip, :port, :rebind_ip, :rebind_port
class << self
def counter
@@ -187,7 +187,7 @@ class FlexNBD
end
def initialize(bin, ip, port)
def initialize(bin, ip, port, rebind_ip = ip, rebind_port = port)
@bin = bin
@debug = (ENV['DEBUG'] && `#{@bin} serve --help` =~ /--verbose/) ? "--verbose" : ""
raise "#{bin} not executable" unless File.executable?(bin)
@@ -195,6 +195,8 @@ class FlexNBD
@ctrl = "/tmp/.flexnbd.ctrl.#{Time.now.to_i}.#{rand}"
@ip = ip
@port = port
@rebind_ip = rebind_ip
@rebind_port = rebind_port
@kill = []
end
@@ -224,6 +226,8 @@ class FlexNBD
"--addr #{ip} "\
"--port #{port} "\
"--file #{file} "\
"--rebind-addr #{rebind_ip} " \
"--rebind-port #{rebind_port} " \
"--sock #{ctrl} "\
"#{@debug} "\
"#{acl.join(' ')}"

View File

@@ -54,12 +54,28 @@ module FlexNBD
send_request( 2, handle )
end
# Issue a read request for +len+ bytes starting at offset +from+.
#
# Only the request is sent (via send_request, defined elsewhere in
# this class, with 0 as its first argument); reading whatever comes
# back is left to the caller.
#
# +handle+ is forwarded to send_request and defaults to "myhandle".
# Previously the argument was accepted but silently ignored in favour
# of the hard-coded default.
def write_read_request( from, len, handle="myhandle" )
  send_request( 0, handle, from, len )
end
# Write +data+ to the underlying socket as-is and return whatever the
# socket's #write returns.
def write_data( data )
  @sock.write data
end
# Convenience wrapper: send a read request for +len+ bytes at offset
# +from+ (using the fixed handle "myhandle") and then read +len+ raw
# bytes back, all inside a 2 second timing_out block labelled
# "Timed out reading".  Returns the value of the timing_out call.
def read( from, len )
  timing_out( 2, "Timed out reading" ) {
    send_request( 0, "myhandle", from, len )
    read_raw( len )
  }
end
# Read +len+ bytes directly from the underlying socket, with no
# request sent first and no timeout wrapper applied here.
def read_raw( len )
  @sock.read len
end
def send_mirror
read_hello()
write_write_request( 0, 8 )

View File

@@ -34,6 +34,10 @@ class TestDestErrorHandling < Test::Unit::TestCase
end
# Drive the destination with the "source/close_mid_read" fake via the
# run_fake helper.
def test_partial_read_causes_error
  fake_name = "source/close_mid_read"
  run_fake( fake_name )
end
# Drive the destination with the "source/connect_during_hello" fake
# via the run_fake helper.
def test_double_connect_during_hello
  fake_name = "source/connect_during_hello"
  run_fake( fake_name )
end
@@ -72,18 +76,32 @@ class TestDestErrorHandling < Test::Unit::TestCase
# This fake runs a failed migration then a succeeding one, so we
# expect the destination to take control.
run_fake( "source/close_after_entrust_reply" )
assert_control
end
# Drive the destination with the "source/successful_transfer" fake via
# the run_fake helper.
# NOTE(review): going by the test name, that fake is presumably the one
# that blocks the rebind port for a while -- confirm against the fake.
def test_cant_rebind_retries
  fake_name = "source/successful_transfer"
  run_fake( fake_name )
end
private
def run_fake( name )
@env.run_fake( name, @env.ip, @env.port1 )
@env.run_fake( name, @env.ip, @env.port1, @env.ip, @env.rebind_port1 )
assert @env.fake_reports_success, "#{name} failed."
end
# Return the first element of what @env.status1 yields, discarding the
# second (which assert_no_control names stderr).
def status
  parsed, _stderr = @env.status1
  parsed
end
# Assert that the 'has_control' entry of the first element returned by
# @env.status1 is falsy; fails with "Thought it had control" otherwise.
def assert_no_control
  current, _stderr = @env.status1
  has_control = current['has_control']
  assert !has_control, "Thought it had control"
end
# Assert that the 'has_control' entry of the current status is truthy;
# fails with "Didn't think it had control" otherwise.
def assert_control
  has_control = status['has_control']
  assert has_control, "Didn't think it had control"
end
end # class TestDestErrorHandling

View File

@@ -80,6 +80,7 @@ class TestSourceErrorHandling < Test::Unit::TestCase
# Call can_die(0) on the first server, then drive it with the
# "dest/close_after_entrust" fake via run_fake.
# NOTE(review): can_die(0) presumably marks an exit with status 0 as
# acceptable for this test -- confirm against FlexNBD#can_die.
def test_post_entrust_disconnect_causes_retry
  server = @env.nbd1
  server.can_die( 0 )
  run_fake "dest/close_after_entrust"
end