#include "client.h"
#include "serve.h"
#include "ioutil.h"
#include "sockutil.h"
#include "util.h"
#include "bitset.h"
#include "nbdtypes.h"
#include "self_pipe.h"

/* NOTE(review): the system header names were missing from these directives.
 * The list below covers the libc/POSIX calls made in this file; splice()
 * additionally needs _GNU_SOURCE, presumably supplied by the build flags -
 * confirm against the build system. */
#include <sys/mman.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>


/*
 * Handler for the client killswitch signal.  The fd of the client socket is
 * carried in the signal's si_value; we call shutdown() on it, which results
 * in the client thread being wound up.  A failing shutdown() is fatal.
 */
void client_killswitch_hit(int signal __attribute__ ((unused)),
                           siginfo_t *info,
                           void *ptr __attribute__ ((unused)))
{
    int fd = info->si_value.sival_int;

    warn("Killswitch for fd %i activated, calling shutdown on socket", fd);

    FATAL_IF(-1 == shutdown(fd, SHUT_RDWR),
             SHOW_ERRNO("Failed to shutdown() the socket, killing the server")
        );
}

/*
 * Allocates a client for the connection on `socket`, served by `serve`.
 * Creates the client's stop-signal self-pipe and its (unarmed) killswitch
 * timer.  The result is released with client_destroy().
 */
struct client *client_create(struct server *serve, int socket)
{
    NULLCHECK(serve);

    struct client *c;
    struct sigevent evp = {
        .sigev_notify = SIGEV_SIGNAL,
        .sigev_signo = CLIENT_KILLSWITCH_SIGNAL
    };

    /*
     * Our killswitch closes this socket, forcing read() and write() calls
     * blocked on it to return with an error. The thread then close()s the
     * socket itself, avoiding races.
     */
    evp.sigev_value.sival_int = socket;

    c = xmalloc(sizeof(struct client));

    /* client_serve() and client_cleanup() read disconnect, mapped and fileno
     * before anything here assigns them, so start from an all-zero struct
     * rather than depending on what the allocator hands back. */
    memset(c, 0, sizeof(*c));

    c->stopped = 0;
    c->socket = socket;
    c->serve = serve;

    c->stop_signal = self_pipe_create();

    FATAL_IF_NEGATIVE(timer_create(CLOCK_MONOTONIC, &evp, &(c->killswitch)),
                      SHOW_ERRNO("Failed to create killswitch timer")
        );

    debug("Alloced client %p with socket %d", c, socket);
    return c;
}

/* Asks the client thread to stop by signalling its stop_signal self-pipe. */
void client_signal_stop(struct client *c)
{
    NULLCHECK(c);

    debug("client %p: signal stop (%d, %d)", c,
          c->stop_signal->read_fd, c->stop_signal->write_fd);
    self_pipe_signal(c->stop_signal);
}

/* Deletes the killswitch timer, destroys the stop signal and frees the
 * client.  The socket, mapping and file are not touched here. */
void client_destroy(struct client *client)
{
    NULLCHECK(client);

    FATAL_IF_NEGATIVE(timer_delete(client->killswitch),
                      SHOW_ERRNO("Couldn't delete killswitch")
        );

    debug("Destroying stop signal for client %p", client);
    self_pipe_destroy(client->stop_signal);
    debug("Freeing client %p", client);
    free(client);
}


/* Reads nbytes from the client socket into dst, raising an error on failure.
 * Only for use inside write_not_zeroes(): it refers to that function's
 * `client` and `from` locals. */
#define DO_READ(dst, nbytes) ERROR_IF_NEGATIVE( \
        readloop( \
            client->socket, \
            (dst), \
            (nbytes) \
        ), \
        "read failed %" PRIu64 "+%" PRIu64, from, (uint64_t) (nbytes) \
    )

/**
 * So waiting on client->socket is len bytes of data, and we must write it all
 * to client->mapped.  However while doing so we must consult the bitmap
 * client->serve->allocation_map, which is a bitmap where one bit represents
 * block_allocation_resolution bytes.  Where a bit isn't set, there are no
 * disc blocks allocated for that portion of the file, and we'd like to keep
 * it that way.
 *
 * If the bitmap shows that every block in our prospective write is already
 * allocated, we can proceed as normal and read straight into the mapping.
 * Otherwise the data is read one block at a time into a scratch buffer and
 * only copied into the mapping if it contains a non-zero byte.
 */
void write_not_zeroes(struct client *client, uint64_t from, uint64_t len)
{
    NULLCHECK(client);
    NULLCHECK(client->serve);
    NULLCHECK(client->serve->allocation_map);

    struct bitset *map = client->serve->allocation_map;

    while (len > 0) {
        /* so we have to calculate how much of our input to consider
         * next based on the bitmap of allocated blocks.  This will be
         * at a coarser resolution than the actual write, which may
         * not fall on a block boundary at either end.  So we look up
         * how many blocks our write covers, then cut off the start
         * and end to get the exact number of bytes.
         */
        uint64_t run = bitset_run_count(map, from, len);

        debug("write_not_zeroes: from=%" PRIu64 ", len=%" PRIu64
              ", run=%" PRIu64, from, len, run);

        if (run > len) {
            run = len;
            debug("(run adjusted to %" PRIu64 ")", run);
        }

        /*
        // Useful but expensive
        if (0)
        {
            uint64_t i;
            fprintf(stderr, "full map resolution=%d: ", map->resolution);
            for (i=0; i<client->serve->size; i+=map->resolution) {
                int here = (from >= i && from < i+map->resolution);

                if (here) { fprintf(stderr, ">"); }
                fprintf(stderr, bitset_is_set_at(map, i) ? "1" : "0");
                if (here) { fprintf(stderr, "<"); }
            }
            fprintf(stderr, "\n");
        }
        */

        if (bitset_is_set_at(map, from)) {
            debug("writing the lot: from=%" PRIu64 ", run=%" PRIu64,
                  from, run);
            /* already allocated, just write it all */
            DO_READ(client->mapped + from, run);
            /* We know from our earlier call to bitset_run_count that the
             * bitset is all-1s at this point, but we need to dirty it for the
             * sake of the event stream - the actual bytes have changed, and we
             * are interested in that fact.
             */
            bitset_set_range(map, from, run);
            len -= run;
            from += run;
        } else {
            char zerobuffer[block_allocation_resolution];

            /* not allocated, read in block_allocation_resolution sized
             * pieces, each one ending on a block boundary */
            while (run > 0) {
                uint64_t blockrun = block_allocation_resolution -
                    (from % block_allocation_resolution);
                if (blockrun > run) {
                    blockrun = run;
                }

                DO_READ(zerobuffer, blockrun);

                /* This reads the buffer twice in the worst case
                 * but we're leaning on memcmp failing early
                 * and memcpy being fast, rather than try to
                 * hand-optimize something specific.
                 */
                int all_zeros = (zerobuffer[0] == 0) &&
                    (0 == memcmp(zerobuffer, zerobuffer + 1, blockrun - 1));

                if (!all_zeros) {
                    memcpy(client->mapped + from, zerobuffer, blockrun);
                    bitset_set_range(map, from, blockrun);
                    /* at this point we could choose to
                     * short-cut the rest of the write for
                     * faster I/O but by continuing to do it
                     * the slow way we preserve as much
                     * sparseness as possible.
                     */
                }
                /* When the block is all_zeros, no bytes have changed, so we
                 * don't need to put an event into the bitset stream.  This may
                 * be surprising in the future.
                 */

                len -= blockrun;
                run -= blockrun;
                from += blockrun;
            }
        }
    }
}

#undef DO_READ


/* Reads one raw (wire-format) request from fd into *out_request.  Returns
 * whatever readloop() returns. */
int fd_read_request(int fd, struct nbd_request_raw *out_request)
{
    return readloop(fd, out_request, sizeof(struct nbd_request_raw));
}


/* Returns 1 if *out_request was filled with a valid request which we should
 * try to honour, 0 otherwise.  *disconnected is set to 1 whenever the read
 * from the socket fails.
 */
int client_read_request(struct client *client,
                        struct nbd_request *out_request, int *disconnected)
{
    NULLCHECK(client);
    NULLCHECK(out_request);
    NULLCHECK(disconnected);

    struct nbd_request_raw request_raw;

    if (fd_read_request(client->socket, &request_raw) == -1) {
        *disconnected = 1;
        switch (errno) {
        case 0:
            warn("EOF while reading request");
            return 0;
        case ECONNRESET:
            warn("Connection reset while" " reading request");
            return 0;
        case ETIMEDOUT:
            warn("Connection timed out while" " reading request");
            return 0;
        default:
            /* FIXME: I've seen this happen, but I
             * couldn't reproduce it so I'm leaving
             * it here with a better debug output in
             * the hope it'll spontaneously happen
             * again.  It should *probably* be an
             * error() call, but I want to be sure.
             */
            fatal("Error reading request: %d, %s", errno, strerror(errno));
        }
    }

    nbd_r2h_request(&request_raw, out_request);

    return 1;
}

/*
 * Builds a reply carrying `handle` and `error`, converts it to wire format
 * and writes it to fd.  Write failures are reported through error() or
 * fatal() depending on errno; otherwise returns 1.
 */
int fd_write_reply(int fd, uint64_t handle, int error)
{
    struct nbd_reply reply;
    struct nbd_reply_raw reply_raw;

    reply.magic = REPLY_MAGIC;
    reply.error = error;
    reply.handle.w = handle;

    nbd_h2r_reply(&reply, &reply_raw);
    debug("Replying with handle=0x%08" PRIX64 ", error=%d", handle, error);

    if (-1 == writeloop(fd, &reply_raw, sizeof(reply_raw))) {
        switch (errno) {
        case ECONNRESET:
            error("Connection reset while writing reply");
            break;
        case EBADF:
            fatal("Tried to write to an invalid file descriptor");
            break;
        case EPIPE:
            error("Remote end closed");
            break;
        default:
            fatal("Unhandled error while writing: %d", errno);
        }
    }

    return 1;
}


/* Writes a reply to request *request, with error, to the client's
 * socket.
 * Returns 1; we don't check for errors on the write.
 * TODO: Check for errors on the write.
 */
int client_write_reply(struct client *client, struct nbd_request *request,
                       int error)
{
    return fd_write_reply(client->socket, request->handle.w, error);
}


/* Sends the initial hello to the client, advertising a device of `size`
 * bytes and the flags we support.  A failed write raises an error. */
void client_write_init(struct client *client, uint64_t size)
{
    struct nbd_init init = { {0} };
    struct nbd_init_raw init_raw = { {0} };

    memcpy(init.passwd, INIT_PASSWD, sizeof(init.passwd));
    init.magic = INIT_MAGIC;
    init.size = size;
    /* As more features are implemented, this is the place to advertise
     * them.
     */
    init.flags = FLAG_HAS_FLAGS | FLAG_SEND_FLUSH | FLAG_SEND_FUA;
    memset(init.reserved, 0, sizeof(init.reserved));

    nbd_h2r_init(&init, &init_raw);

    ERROR_IF_NEGATIVE(writeloop(client->socket, &init_raw, sizeof(init_raw)),
                      "Couldn't send hello");
}


/* Remove len bytes from the client socket.  This is needed when the
 * client sends a write we can't honour - we need to get rid of the
 * bytes they've already written before we can look for another request.
*/ void client_flush(struct client *client, size_t len) { int devnull = open("/dev/null", O_WRONLY); FATAL_IF_NEGATIVE(devnull, "Couldn't open /dev/null: %s", strerror(errno)); int pipes[2]; pipe(pipes); const unsigned int flags = SPLICE_F_MORE | SPLICE_F_MOVE; size_t spliced = 0; while (spliced < len) { ssize_t received = splice(client->socket, NULL, pipes[1], NULL, len - spliced, flags); FATAL_IF_NEGATIVE(received, "splice error: %s", strerror(errno)); ssize_t junked = 0; while (junked < received) { ssize_t junk; junk = splice(pipes[0], NULL, devnull, NULL, received, flags); FATAL_IF_NEGATIVE(junk, "splice error: %s", strerror(errno)); junked += junk; } spliced += received; } debug("Flushed %d bytes", len); close(devnull); } /* Check to see if the client's request needs a reply constructing. * Returns 1 if we do, 0 otherwise. * request_err is set to 0 if the client sent a bad request, in which * case we drop the connection. */ int client_request_needs_reply(struct client *client, struct nbd_request request) { /* The client is stupid, but don't take down the whole server as a result. * We send a reply before disconnecting so that at least some indication of * the problem is visible, and so proxies don't retry the same (bad) request * forever. */ if (request.magic != REQUEST_MAGIC) { warn("Bad magic 0x%08X from client", request.magic); client_write_reply(client, &request, EBADMSG); client->disconnect = 1; // no need to flush return 0; } debug("request type=%" PRIu16 ", flags=%" PRIu16 ", from=%" PRIu64 ", len=%" PRIu32 ", handle=0x%08X", request.type, request.flags, request.from, request.len, request.handle); /* check it's not out of range. 
NBD protocol requires ENOSPC to be * returned in this instance */ if (request.from + request.len > client->serve->size) { warn("write request %" PRIu64 "+%" PRIu32 " out of range", request.from, request.len); if (request.type == REQUEST_WRITE) { client_flush(client, request.len); } client_write_reply(client, &request, ENOSPC); client->disconnect = 0; return 0; } switch (request.type) { case REQUEST_READ: break; case REQUEST_WRITE: break; case REQUEST_DISCONNECT: debug("request disconnect"); client->disconnect = 1; return 0; case REQUEST_FLUSH: break; default: /* NBD prototcol says servers SHOULD return EINVAL to unknown * commands */ warn("Unknown request 0x%08X", request.type); client_write_reply(client, &request, EINVAL); client->disconnect = 0; return 0; } return 1; } void client_reply_to_read(struct client *client, struct nbd_request request) { off64_t offset; debug("request read %ld+%d", request.from, request.len); sock_set_tcp_cork(client->socket, 1); client_write_reply(client, &request, 0); offset = request.from; /* If we get cut off partway through this sendfile, we don't * want to kill the server. This should be an error. */ ERROR_IF_NEGATIVE(sendfileloop(client->socket, client->fileno, &offset, request.len), "sendfile failed from=%ld, len=%d", offset, request.len); sock_set_tcp_cork(client->socket, 0); } void client_reply_to_write(struct client *client, struct nbd_request request) { debug("request write from=%" PRIu64 ", len=%" PRIu32 ", handle=0x%08X", request.from, request.len, request.handle); if (client->serve->allocation_map_built) { write_not_zeroes(client, request.from, request.len); } else { debug("No allocation map, writing directly."); /* If we get cut off partway through reading this data: * */ ERROR_IF_NEGATIVE(readloop(client->socket, client->mapped + request.from, request.len), "reading write data failed from=%ld, len=%d", request.from, request.len); /* the allocation_map is shared between client threads, and may be * being built. 
We need to reflect the write in it, as it may be in * a position the builder has already gone over. */ bitset_set_range(client->serve->allocation_map, request.from, request.len); } // Only flush if FUA is set -- overridden for now to force flush after each // write. // if (request.flags & CMD_FLAG_FUA) { if (1) { /* multiple of page size */ uint64_t from_rounded = request.from & (~(sysconf(_SC_PAGE_SIZE) - 1)); uint64_t len_rounded = request.len + (request.from - from_rounded); debug("Calling msync from=%" PRIu64 ", len=%" PRIu64 "", from_rounded, len_rounded); FATAL_IF_NEGATIVE(msync(client->mapped + from_rounded, len_rounded, MS_SYNC | MS_INVALIDATE), "msync failed %ld %ld", request.from, request.len); } client_write_reply(client, &request, 0); } void client_reply_to_flush(struct client *client, struct nbd_request request) { debug("request flush from=%" PRIu64 ", len=%" PRIu32 ", handle=0x%08X", request.from, request.len, request.handle); ERROR_IF_NEGATIVE(msync (client->mapped, client->mapped_size, MS_SYNC | MS_INVALIDATE), "flush failed"); client_write_reply(client, &request, 0); } void client_reply(struct client *client, struct nbd_request request) { switch (request.type) { case REQUEST_READ: client_reply_to_read(client, request); break; case REQUEST_WRITE: client_reply_to_write(client, request); break; case REQUEST_FLUSH: client_reply_to_flush(client, request); break; } } /* Starts a timer that will kill the whole process if disarm is not called * within a timeout (see CLIENT_HANDLE_TIMEOUT). 
*/ void client_arm_killswitch(struct client *client) { struct itimerspec its = { .it_value = {.tv_nsec = 0,.tv_sec = CLIENT_HANDLER_TIMEOUT}, .it_interval = {.tv_nsec = 0,.tv_sec = 0} }; if (!client->serve->use_killswitch) { return; } debug("Arming killswitch"); FATAL_IF_NEGATIVE(timer_settime(client->killswitch, 0, &its, NULL), SHOW_ERRNO("Failed to arm killswitch") ); return; } void client_disarm_killswitch(struct client *client) { struct itimerspec its = { .it_value = {.tv_nsec = 0,.tv_sec = 0}, .it_interval = {.tv_nsec = 0,.tv_sec = 0} }; if (!client->serve->use_killswitch) { return; } debug("Disarming killswitch"); FATAL_IF_NEGATIVE(timer_settime(client->killswitch, 0, &its, NULL), SHOW_ERRNO("Failed to disarm killswitch") ); return; } /* Returns 0 if we should continue trying to serve requests */ int client_serve_request(struct client *client) { struct nbd_request request = { 0 }; int stop = 1; int disconnected = 0; fd_set rfds, efds; int fd_count; /* wait until there are some bytes on the fd before committing to reads * FIXME: this whole scheme is broken because we're using blocking reads. * read() can block directly after a select anyway, and it's possible that, * without the killswitch, we'd hang forever. With the killswitch, we just * hang for "a while". The Right Thing to do is to rewrite client.c to be * non-blocking. */ FD_ZERO(&rfds); FD_SET(client->socket, &rfds); self_pipe_fd_set(client->stop_signal, &rfds); FD_ZERO(&efds); FD_SET(client->socket, &efds); fd_count = sock_try_select(FD_SETSIZE, &rfds, NULL, &efds, NULL); if (fd_count == 0) { /* This "can't ever happen" */ fatal("No FDs selected, and no timeout!"); } else if (fd_count < 0) { fatal("Select failed"); } if (self_pipe_fd_isset(client->stop_signal, &rfds)) { debug("Client received stop signal."); return 1; // Don't try to serve more requests } if (FD_ISSET(client->socket, &efds)) { debug("Client connection closed"); return 1; } /* We arm / disarm around the whole request cycle. 
The reason for this is * that the remote peer could uncleanly die at any point; if we're stuck on * a blocking read(), then that will hang for (almost) forever. This is bad * in general, makes the server respond only to kill -9, and breaks * outward mirroring in a most unpleasant way. * * Don't forget to disarm before exiting, no matter what! * * The replication is simple: open a connection to the flexnbd server, write * a single byte, and then wait. * */ client_arm_killswitch(client); if (!client_read_request(client, &request, &disconnected)) { client_disarm_killswitch(client); return stop; } if (disconnected) { client_disarm_killswitch(client); return stop; } if (!client_request_needs_reply(client, request)) { client_disarm_killswitch(client); return client->disconnect; } { if (!server_is_closed(client->serve)) { client_reply(client, request); stop = 0; } } client_disarm_killswitch(client); return stop; } void client_send_hello(struct client *client) { client_write_init(client, client->serve->size); } void client_cleanup(struct client *client, int fatal __attribute__ ((unused))) { info("client cleanup for client %p", client); /* If the thread hits an error, we need to ensure this is off */ client_disarm_killswitch(client); if (client->socket) { FATAL_IF_NEGATIVE(close(client->socket), "Error closing client socket %d", client->socket); debug("Closed client socket fd %d", client->socket); client->socket = -1; } if (client->mapped) { munmap(client->mapped, client->serve->size); } if (client->fileno) { FATAL_IF_NEGATIVE(close(client->fileno), "Error closing file %d", client->fileno); debug("Closed client file fd %d", client->fileno); client->fileno = -1; } if (server_acl_locked(client->serve)) { server_unlock_acl(client->serve); } } void *client_serve(void *client_uncast) { struct client *client = (struct client *) client_uncast; error_set_handler((cleanup_handler *) client_cleanup, client); info("client: mmaping file"); 
FATAL_IF_NEGATIVE(open_and_mmap(client->serve->filename, &client->fileno, &client->mapped_size, (void **) &client->mapped), "Couldn't open/mmap file %s: %s", client->serve->filename, strerror(errno) ); FATAL_IF_NEGATIVE(madvise (client->mapped, client->serve->size, MADV_RANDOM), SHOW_ERRNO("Failed to madvise() %s", client->serve->filename) ); debug("Opened client file fd %d", client->fileno); debug("client: sending hello"); client_send_hello(client); debug("client: serving requests"); while (client_serve_request(client) == 0); debug("client: stopped serving requests"); client->stopped = 1; if (client->disconnect) { debug("client: control arrived"); server_control_arrived(client->serve); } debug("Cleaning client %p up normally in thread %p", client, pthread_self()); client_cleanup(client, 0); debug("Client thread done"); return NULL; }