Retry failed rebind attempts

When we receive a migration, if rebinding to the new listen address and
port fails for a reason which might be fixable, rather than killing the
server we retry once a second.  Also in this patch: non-overlapping log
messages and a fix for the client going away halfway through a sendfile
loop.
This commit is contained in:
Alex Young
2012-07-12 14:14:46 +01:00
parent 9002341e77
commit 10b46beeea
12 changed files with 194 additions and 20 deletions

View File

@@ -366,7 +366,11 @@ void client_reply_to_read( struct client* client, struct nbd_request request )
client_write_reply( client, &request, 0);
offset = request.from;
FATAL_IF_NEGATIVE(
/* If we get cut off partway through this sendfile, we don't
* want to kill the server. This should be an error.
*/
ERROR_IF_NEGATIVE(
sendfileloop(
client->socket,
client->fileno,

View File

@@ -192,6 +192,62 @@ int server_port( struct server * server )
}
/* Try to bind to our serving socket, retrying until it works or gives a
* fatal error. */
void serve_bind( struct server * serve )
{
int bind_result;
char s_address[64];
memset( s_address, 0, 64 );
strcpy( s_address, "???" );
inet_ntop( serve->bind_to.generic.sa_family,
sockaddr_address_data( &serve->bind_to.generic),
s_address, 64 );
do {
bind_result = bind(
serve->server_fd,
&serve->bind_to.generic,
sizeof(serve->bind_to));
if ( 0 == bind_result ) {
info( "Bound to %s port %d",
s_address,
ntohs(serve->bind_to.v4.sin_port));
break;
}
else {
warn( "Couldn't bind to %s port %d: %s",
s_address,
ntohs(serve->bind_to.v4.sin_port),
strerror( errno ) );
switch (errno){
/* bind() can give us EACCES,
* EADDRINUSE, EADDRNOTAVAIL, EBADF,
* EINVAL or ENOTSOCK.
*
* Any of these other than EACCES,
* EADDRINUSE or EADDRNOTAVAIL signify
* that there's a logic error somewhere.
*/
case EACCES:
case EADDRINUSE:
case EADDRNOTAVAIL:
debug("retrying");
sleep(1);
continue;
default:
fatal( "Giving up" );
}
}
} while ( 1 );
}
/** Prepares a listening socket for the NBD server, binding etc. */
void serve_open_server_socket(struct server* params)
{
@@ -205,21 +261,32 @@ void serve_open_server_socket(struct server* params)
FATAL_IF_NEGATIVE(params->server_fd,
"Couldn't create server socket");
/* We need SO_REUSEADDR so that when we switch from listening to
* serving we don't have to change address if we don't want to.
*
* If this fails, it's not necessarily bad in principle, but at
* this point in the code we can't tell if it's going to be a
* problem. It's also indicative of something odd going on, so
* we barf.
*/
FATAL_IF_NEGATIVE(
setsockopt(params->server_fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)),
"Couldn't set SO_REUSEADDR"
);
/* TCP_NODELAY makes everything not be slow. If we can't set
* this, again, there's something odd going on which we don't
* understand.
*/
FATAL_IF_NEGATIVE(
setsockopt(params->server_fd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
"Couldn't set TCP_NODELAY"
);
FATAL_IF_NEGATIVE(
bind(params->server_fd, &params->bind_to.generic,
sizeof(params->bind_to)),
"Couldn't bind server to IP address"
);
/* If we can't bind, presumably that's because someone else is
* squatting on our ip/port combo, or the ip isn't yet
* configured. Ideally we want to retry this. */
serve_bind(params);
FATAL_IF_NEGATIVE(
listen(params->server_fd, params->tcp_backlog),

View File

@@ -45,12 +45,23 @@ void mylog(int line_level, const char* format, ...)
{
va_list argptr;
if (line_level < log_level) { return; }
/* Copy the format sideways so that we can append a "\n" to it
* and avoid a second fprintf. This stops log lines from getting
* interleaved.
*/
int format_len = strlen( format );
char *format_n = xmalloc( format_len + 2 );
memcpy( format_n, format, format_len );
format_n[format_len] = '\n';
va_start(argptr, format);
vfprintf(stderr, format, argptr);
vfprintf(stderr, format_n, argptr);
va_end(argptr);
fprintf(stderr, "\n");
free( format_n );
}
void* xrealloc(void* ptr, size_t size)

View File

@@ -5,7 +5,7 @@ require 'file_writer'
class Environment
attr_reader( :blocksize, :filename1, :filename2, :ip,
:port1, :port2, :nbd1, :nbd2, :file1, :file2 )
:port1, :port2, :nbd1, :nbd2, :file1, :file2, :rebind_port1 )
def initialize
@blocksize = 1024
@@ -14,9 +14,11 @@ class Environment
@ip = "127.0.0.1"
@available_ports = [*40000..41000] - listening_ports
@port1 = @available_ports.shift
@rebind_port1 = @available_ports.shift
@port2 = @available_ports.shift
@nbd1 = FlexNBD.new("../../build/flexnbd", @ip, @port1)
@nbd2 = FlexNBD.new("../../build/flexnbd", @ip, @port2)
@rebind_port2 = @available_ports.shift
@nbd1 = FlexNBD.new("../../build/flexnbd", @ip, @port1, @ip, @rebind_port1)
@nbd2 = FlexNBD.new("../../build/flexnbd", @ip, @port2, @ip, @rebind_port2)
@fake_pid = nil
end
@@ -95,6 +97,7 @@ class Environment
end
@nbd1.can_die(0)
@nbd1.kill
@nbd2.kill
@@ -104,7 +107,7 @@ class Environment
end
def run_fake( name, addr, port )
def run_fake( name, addr, port, rebind_addr = addr, rebind_port = port )
fakedir = File.join( File.dirname( __FILE__ ), "fakes" )
fake = Dir[File.join( fakedir, name ) + "*"].sort.find { |fn|
File.executable?( fn )
@@ -113,8 +116,11 @@ class Environment
raise "no fake executable" unless fake
raise "no addr" unless addr
raise "no port" unless port
raise "no rebind_addr" unless rebind_addr
raise "no rebind_port" unless rebind_port
@fake_pid = fork do
exec fake + " " + addr.to_s + " " + port.to_s + " " + @nbd1.pid.to_s
exec [fake, addr, port, @nbd1.pid, rebind_addr, rebind_port].map{|x| x.to_s}.join(" ")
end
sleep(0.5)
end

View File

@@ -24,5 +24,6 @@ client.close
client2 = server.accept
client2.receive_mirror
exit(0)

View File

@@ -9,7 +9,7 @@
require 'flexnbd/fake_source'
include FlexNBD
addr, port, srv_pid = *ARGV
addr, port, srv_pid, rebind_addr, rebind_port = *ARGV
client = FakeSource.new( addr, port, "Timed out connecting" )
client.read_hello
@@ -25,8 +25,8 @@ sleep(0.25)
client2 = FakeSource.new( addr, port, "Timed out reconnecting to mirror" )
client2.send_mirror
sleep(0.25)
client3 = FakeSource.new( addr, port, "Timed out reconnecting to read" )
sleep(1)
client3 = FakeSource.new( rebind_addr, rebind_port, "Timed out reconnecting to read" )
client3.close
exit(0)

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
# Connect, but get the protocol wrong: don't read the hello, so we
# close and break the sendfile.
require 'flexnbd/fake_source'
include FlexNBD
addr, port, srv_pid, newaddr, newport = *ARGV
client = FakeSource.new( addr, port, "Timed out connecting" )
client.write_read_request( 0, 8 )
client.read_raw( 4 )
client.close
exit(0)

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env ruby
# Successfully send a migration, but squat on the IP and port which
# the destination wants to rebind to. The destination should retry
# every second, so we give it up then attempt to connect to the new
# server.
require 'flexnbd/fake_source'
include FlexNBD
addr, port, srv_pid, newaddr, newport = *ARGV
squatter = TCPServer.open( newaddr, newport.to_i )
client = FakeSource.new( addr, port, "Timed out connecting" )
client.send_mirror()
sleep(1)
squatter.close()
sleep(1)
client2 = FakeSource.new( newaddr, newport.to_i, "Timed out reconnecting" )
client2.read_hello
client2.read( 0, 8 )
client2.close
exit( 0 )

View File

@@ -166,7 +166,7 @@ end # class ValgrindExecutor
# Noddy test class to exercise FlexNBD from the outside for testing.
#
class FlexNBD
attr_reader :bin, :ctrl, :pid, :ip, :port
attr_reader :bin, :ctrl, :pid, :ip, :port, :rebind_ip, :rebind_port
class << self
def counter
@@ -187,7 +187,7 @@ class FlexNBD
end
def initialize(bin, ip, port)
def initialize(bin, ip, port, rebind_ip = ip, rebind_port = port)
@bin = bin
@debug = (ENV['DEBUG'] && `#{@bin} serve --help` =~ /--verbose/) ? "--verbose" : ""
raise "#{bin} not executable" unless File.executable?(bin)
@@ -195,6 +195,8 @@ class FlexNBD
@ctrl = "/tmp/.flexnbd.ctrl.#{Time.now.to_i}.#{rand}"
@ip = ip
@port = port
@rebind_ip = rebind_ip
@rebind_port = rebind_port
@kill = []
end
@@ -224,6 +226,8 @@ class FlexNBD
"--addr #{ip} "\
"--port #{port} "\
"--file #{file} "\
"--rebind-addr #{rebind_ip} " \
"--rebind-port #{rebind_port} " \
"--sock #{ctrl} "\
"#{@debug} "\
"#{acl.join(' ')}"

View File

@@ -54,12 +54,28 @@ module FlexNBD
send_request( 2, handle )
end
def write_read_request( from, len, handle="myhandle" )
send_request( 0, "myhandle", from, len )
end
def write_data( data )
@sock.write( data )
end
# Handy utility
def read( from, len )
timing_out( 2, "Timed out reading" ) do
send_request( 0, "myhandle", from, len )
read_raw( len )
end
end
def read_raw( len )
@sock.read( len )
end
def send_mirror
read_hello()
write_write_request( 0, 8 )

View File

@@ -34,6 +34,10 @@ class TestDestErrorHandling < Test::Unit::TestCase
end
def test_partial_read_causes_error
run_fake( "source/close_mid_read" )
end
def test_double_connect_during_hello
run_fake( "source/connect_during_hello" )
end
@@ -72,18 +76,32 @@ class TestDestErrorHandling < Test::Unit::TestCase
# This fake runs a failed migration then a succeeding one, so we
# expect the destination to take control.
run_fake( "source/close_after_entrust_reply" )
assert_control
end
def test_cant_rebind_retries
run_fake( "source/successful_transfer" )
end
private
def run_fake( name )
@env.run_fake( name, @env.ip, @env.port1 )
@env.run_fake( name, @env.ip, @env.port1, @env.ip, @env.rebind_port1 )
assert @env.fake_reports_success, "#{name} failed."
end
def status
stat, _ = @env.status1
stat
end
def assert_no_control
status, stderr = @env.status1
assert !status['has_control'], "Thought it had control"
end
def assert_control
assert status['has_control'], "Didn't think it had control"
end
end # class TestDestErrorHandling

View File

@@ -80,6 +80,7 @@ class TestSourceErrorHandling < Test::Unit::TestCase
def test_post_entrust_disconnect_causes_retry
@env.nbd1.can_die(0)
run_fake( "dest/close_after_entrust" )
end