Retry failed rebind attempts
When we receive a migration, if rebinding to the new listen address and port fails for a reason which might be fixable, rather than killing the server we retry once a second. Also in this patch: non-overlapping log messages and a fix for the client going away halfway through a sendfile loop.
This commit is contained in:
@@ -5,7 +5,7 @@ require 'file_writer'
|
||||
|
||||
class Environment
|
||||
attr_reader( :blocksize, :filename1, :filename2, :ip,
|
||||
:port1, :port2, :nbd1, :nbd2, :file1, :file2 )
|
||||
:port1, :port2, :nbd1, :nbd2, :file1, :file2, :rebind_port1 )
|
||||
|
||||
def initialize
|
||||
@blocksize = 1024
|
||||
@@ -14,9 +14,11 @@ class Environment
|
||||
@ip = "127.0.0.1"
|
||||
@available_ports = [*40000..41000] - listening_ports
|
||||
@port1 = @available_ports.shift
|
||||
@rebind_port1 = @available_ports.shift
|
||||
@port2 = @available_ports.shift
|
||||
@nbd1 = FlexNBD.new("../../build/flexnbd", @ip, @port1)
|
||||
@nbd2 = FlexNBD.new("../../build/flexnbd", @ip, @port2)
|
||||
@rebind_port2 = @available_ports.shift
|
||||
@nbd1 = FlexNBD.new("../../build/flexnbd", @ip, @port1, @ip, @rebind_port1)
|
||||
@nbd2 = FlexNBD.new("../../build/flexnbd", @ip, @port2, @ip, @rebind_port2)
|
||||
|
||||
@fake_pid = nil
|
||||
end
|
||||
@@ -95,6 +97,7 @@ class Environment
|
||||
end
|
||||
|
||||
|
||||
@nbd1.can_die(0)
|
||||
@nbd1.kill
|
||||
@nbd2.kill
|
||||
|
||||
@@ -104,7 +107,7 @@ class Environment
|
||||
end
|
||||
|
||||
|
||||
def run_fake( name, addr, port )
|
||||
def run_fake( name, addr, port, rebind_addr = addr, rebind_port = port )
|
||||
fakedir = File.join( File.dirname( __FILE__ ), "fakes" )
|
||||
fake = Dir[File.join( fakedir, name ) + "*"].sort.find { |fn|
|
||||
File.executable?( fn )
|
||||
@@ -113,8 +116,11 @@ class Environment
|
||||
raise "no fake executable" unless fake
|
||||
raise "no addr" unless addr
|
||||
raise "no port" unless port
|
||||
raise "no rebind_addr" unless rebind_addr
|
||||
raise "no rebind_port" unless rebind_port
|
||||
|
||||
@fake_pid = fork do
|
||||
exec fake + " " + addr.to_s + " " + port.to_s + " " + @nbd1.pid.to_s
|
||||
exec [fake, addr, port, @nbd1.pid, rebind_addr, rebind_port].map{|x| x.to_s}.join(" ")
|
||||
end
|
||||
sleep(0.5)
|
||||
end
|
||||
|
@@ -24,5 +24,6 @@ client.close
|
||||
client2 = server.accept
|
||||
client2.receive_mirror
|
||||
|
||||
|
||||
exit(0)
|
||||
|
||||
|
@@ -9,7 +9,7 @@
|
||||
require 'flexnbd/fake_source'
|
||||
include FlexNBD
|
||||
|
||||
addr, port, srv_pid = *ARGV
|
||||
addr, port, srv_pid, rebind_addr, rebind_port = *ARGV
|
||||
|
||||
client = FakeSource.new( addr, port, "Timed out connecting" )
|
||||
client.read_hello
|
||||
@@ -25,8 +25,8 @@ sleep(0.25)
|
||||
client2 = FakeSource.new( addr, port, "Timed out reconnecting to mirror" )
|
||||
client2.send_mirror
|
||||
|
||||
sleep(0.25)
|
||||
client3 = FakeSource.new( addr, port, "Timed out reconnecting to read" )
|
||||
sleep(1)
|
||||
client3 = FakeSource.new( rebind_addr, rebind_port, "Timed out reconnecting to read" )
|
||||
client3.close
|
||||
|
||||
exit(0)
|
||||
|
17
tests/acceptance/fakes/source/close_mid_read.rb
Executable file
17
tests/acceptance/fakes/source/close_mid_read.rb
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env ruby
|
||||
|
||||
# Connect, but get the protocol wrong: skip the hello, send a read request,
|
||||
# read only 4 bytes back, then close to break the server's sendfile.
|
||||
|
||||
require 'flexnbd/fake_source'
|
||||
include FlexNBD
|
||||
|
||||
addr, port, srv_pid, newaddr, newport = *ARGV
|
||||
|
||||
client = FakeSource.new( addr, port, "Timed out connecting" )
|
||||
client.write_read_request( 0, 8 )
|
||||
client.read_raw( 4 )
|
||||
client.close
|
||||
|
||||
|
||||
exit(0)
|
29
tests/acceptance/fakes/source/successful_transfer.rb
Executable file
29
tests/acceptance/fakes/source/successful_transfer.rb
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env ruby
|
||||
|
||||
# Successfully send a migration, but squat on the IP and port which
|
||||
# the destination wants to rebind to. The destination should retry
|
||||
# every second, so we release the port and then attempt to connect to
|
||||
# the new server.
|
||||
|
||||
require 'flexnbd/fake_source'
|
||||
include FlexNBD
|
||||
|
||||
addr, port, srv_pid, newaddr, newport = *ARGV
|
||||
|
||||
squatter = TCPServer.open( newaddr, newport.to_i )
|
||||
|
||||
client = FakeSource.new( addr, port, "Timed out connecting" )
|
||||
client.send_mirror()
|
||||
|
||||
sleep(1)
|
||||
|
||||
squatter.close()
|
||||
|
||||
sleep(1)
|
||||
|
||||
client2 = FakeSource.new( newaddr, newport.to_i, "Timed out reconnecting" )
|
||||
client2.read_hello
|
||||
client2.read( 0, 8 )
|
||||
client2.close
|
||||
|
||||
exit( 0 )
|
@@ -166,7 +166,7 @@ end # class ValgrindExecutor
|
||||
# Noddy test class to exercise FlexNBD from the outside for testing.
|
||||
#
|
||||
class FlexNBD
|
||||
attr_reader :bin, :ctrl, :pid, :ip, :port
|
||||
attr_reader :bin, :ctrl, :pid, :ip, :port, :rebind_ip, :rebind_port
|
||||
|
||||
class << self
|
||||
def counter
|
||||
@@ -187,7 +187,7 @@ class FlexNBD
|
||||
end
|
||||
|
||||
|
||||
def initialize(bin, ip, port)
|
||||
def initialize(bin, ip, port, rebind_ip = ip, rebind_port = port)
|
||||
@bin = bin
|
||||
@debug = (ENV['DEBUG'] && `#{@bin} serve --help` =~ /--verbose/) ? "--verbose" : ""
|
||||
raise "#{bin} not executable" unless File.executable?(bin)
|
||||
@@ -195,6 +195,8 @@ class FlexNBD
|
||||
@ctrl = "/tmp/.flexnbd.ctrl.#{Time.now.to_i}.#{rand}"
|
||||
@ip = ip
|
||||
@port = port
|
||||
@rebind_ip = rebind_ip
|
||||
@rebind_port = rebind_port
|
||||
@kill = []
|
||||
end
|
||||
|
||||
@@ -224,6 +226,8 @@ class FlexNBD
|
||||
"--addr #{ip} "\
|
||||
"--port #{port} "\
|
||||
"--file #{file} "\
|
||||
"--rebind-addr #{rebind_ip} " \
|
||||
"--rebind-port #{rebind_port} " \
|
||||
"--sock #{ctrl} "\
|
||||
"#{@debug} "\
|
||||
"#{acl.join(' ')}"
|
||||
|
@@ -54,12 +54,28 @@ module FlexNBD
|
||||
send_request( 2, handle )
|
||||
end
|
||||
|
||||
# Sends a read request (request type 0) for +len+ bytes starting at
# offset +from+, tagged with +handle+. This method only issues the
# request; it does not itself read anything back from the socket.
#
# from   - offset passed through to send_request.
# len    - length passed through to send_request.
# handle - request handle; defaults to "myhandle". Previously this
#          argument was accepted but ignored.
def write_read_request( from, len, handle="myhandle" )
  send_request( 0, handle, from, len )
end
|
||||
|
||||
|
||||
# Passes +data+ straight to @sock.write and returns whatever that
# call returns. No request header is sent here.
def write_data(data)
  @sock.write(data)
end
|
||||
|
||||
|
||||
# Handy utility
|
||||
# Issues a read request for +len+ bytes at offset +from+ (using the
# default "myhandle" handle) and then reads +len+ raw bytes back, all
# inside a timing_out( 2, ... ) block labelled "Timed out reading".
# Returns the result of read_raw.
def read(from, len)
  timing_out(2, "Timed out reading") do
    write_read_request(from, len)
    read_raw(len)
  end
end
|
||||
|
||||
# Calls @sock.read( len ) and returns its result. Nothing is written
# to the socket first, so the caller decides what the bytes mean.
def read_raw(len)
  @sock.read(len)
end
|
||||
|
||||
def send_mirror
|
||||
read_hello()
|
||||
write_write_request( 0, 8 )
|
||||
|
@@ -34,6 +34,10 @@ class TestDestErrorHandling < Test::Unit::TestCase
|
||||
end
|
||||
|
||||
|
||||
# Drives the destination with the "source/close_mid_read" fake via
# this class's run_fake helper.
def test_partial_read_causes_error
  run_fake("source/close_mid_read")
end
|
||||
|
||||
# Drives the destination with the "source/connect_during_hello" fake
# via this class's run_fake helper.
def test_double_connect_during_hello
  run_fake("source/connect_during_hello")
end
|
||||
@@ -72,18 +76,32 @@ class TestDestErrorHandling < Test::Unit::TestCase
|
||||
# This fake runs a failed migration then a succeeding one, so we
|
||||
# expect the destination to take control.
|
||||
run_fake( "source/close_after_entrust_reply" )
|
||||
assert_control
|
||||
end
|
||||
|
||||
|
||||
# Drives the destination with the "source/successful_transfer" fake
# via this class's run_fake helper.
def test_cant_rebind_retries
  run_fake("source/successful_transfer")
end
|
||||
|
||||
|
||||
private
|
||||
def run_fake( name )
|
||||
@env.run_fake( name, @env.ip, @env.port1 )
|
||||
@env.run_fake( name, @env.ip, @env.port1, @env.ip, @env.rebind_port1 )
|
||||
assert @env.fake_reports_success, "#{name} failed."
|
||||
end
|
||||
|
||||
# Returns the first element of @env.status1; any remaining elements
# are discarded.
def status
  nbd_status, _rest = @env.status1
  nbd_status
end
|
||||
|
||||
# Fails the test (with "Thought it had control") if the server status
# reports a truthy 'has_control' entry.
#
# Goes through the status helper, like assert_control, rather than
# destructuring @env.status1 into a shadowing `status` local and an
# unused `stderr` local.
def assert_no_control
  assert !status['has_control'], "Thought it had control"
end
|
||||
|
||||
# Fails the test (with "Didn't think it had control") unless the
# server status reports a truthy 'has_control' entry.
def assert_control
  has_control = status['has_control']
  assert has_control, "Didn't think it had control"
end
|
||||
|
||||
end # class TestDestErrorHandling
|
||||
|
@@ -80,6 +80,7 @@ class TestSourceErrorHandling < Test::Unit::TestCase
|
||||
|
||||
|
||||
# Runs the "dest/close_after_entrust" fake after calling can_die(0)
# on nbd1.
# NOTE(review): can_die(0) presumably tells the harness that nbd1
# exiting with status 0 is acceptable here - confirm against
# FlexNBD#can_die.
def test_post_entrust_disconnect_causes_retry
  @env.nbd1.can_die(0)
  run_fake("dest/close_after_entrust")
end
|
||||
|
||||
|
Reference in New Issue
Block a user