diff -urN linux-2.6.14.3-vanilla/arch/um/drivers/ubd_kern.c linux-2.6.14.3-20051130-newubd/arch/um/drivers/ubd_kern.c --- linux-2.6.14.3-vanilla/arch/um/drivers/ubd_kern.c 2005-11-24 22:10:21.000000000 +0000 +++ linux-2.6.14.3-20051130-newubd/arch/um/drivers/ubd_kern.c 2005-11-30 00:30:33.000000000 +0000 @@ -56,20 +56,6 @@ enum ubd_req { UBD_READ, UBD_WRITE }; -struct io_thread_req { - enum ubd_req op; - int fds[2]; - unsigned long offsets[2]; - unsigned long long offset; - unsigned long length; - char *buffer; - int sectorsize; - unsigned long sector_mask; - unsigned long long cow_offset; - unsigned long bitmap_words[2]; - int error; -}; - extern int open_ubd_file(char *file, struct openflags *openflags, char **backing_file_out, int *bitmap_offset_out, unsigned long *bitmap_len_out, int *data_offset_out, @@ -80,29 +66,7 @@ unsigned long *bitmap_len_out, int *data_offset_out); extern int read_cow_bitmap(int fd, void *buf, int offset, int len); -extern void do_io(struct io_thread_req *req); - -static inline int ubd_test_bit(__u64 bit, unsigned char *data) -{ - __u64 n; - int bits, off; - - bits = sizeof(data[0]) * 8; - n = bit / bits; - off = bit % bits; - return((data[n] & (1 << off)) != 0); -} - -static inline void ubd_set_bit(__u64 bit, unsigned char *data) -{ - __u64 n; - int bits, off; - bits = sizeof(data[0]) * 8; - n = bit / bits; - off = bit % bits; - data[n] |= (1 << off); -} /*End stuff from ubd_user.h*/ #define DRIVER_NAME "uml-blkdev" @@ -110,8 +74,6 @@ static DEFINE_SPINLOCK(ubd_io_lock); static DEFINE_SPINLOCK(ubd_lock); -static void (*do_ubd)(void); - static int ubd_open(struct inode * inode, struct file * filp); static int ubd_release(struct inode * inode, struct file * file); static int ubd_ioctl(struct inode * inode, struct file * file, @@ -120,10 +82,10 @@ #define MAX_DEV (8) static struct block_device_operations ubd_blops = { - .owner = THIS_MODULE, - .open = ubd_open, - .release = ubd_release, - .ioctl = ubd_ioctl, + .owner = THIS_MODULE, + 
.open = ubd_open, + .release = ubd_release, + .ioctl = ubd_ioctl, }; /* Protected by the queue_lock */ @@ -143,6 +105,21 @@ .cl = 1 }) #endif +/* max number of scatter/gather segments */ +#define MAX_SG 32 + +struct io_thread_request { + struct request *req; + int nsegs; + struct scatterlist sg[MAX_SG]; +}; + +struct io_thread_reply { + struct request *req; + int error; + unsigned long sectors; +}; + /* Not protected - changed only in ubd_setup_common and then only to * to enable O_SYNC. */ @@ -155,9 +132,10 @@ unsigned long *bitmap; unsigned long bitmap_len; int bitmap_offset; - int data_offset; + int data_offset; }; + struct ubd { char *file; int count; @@ -172,10 +150,10 @@ #define DEFAULT_COW { \ .file = NULL, \ - .fd = -1, \ - .bitmap = NULL, \ + .fd = -1, \ + .bitmap = NULL, \ .bitmap_offset = 0, \ - .data_offset = 0, \ + .data_offset = 0, \ } #define DEFAULT_UBD { \ @@ -185,8 +163,8 @@ .size = -1, \ .boot_openflags = OPEN_FLAGS, \ .openflags = OPEN_FLAGS, \ - .no_cow = 0, \ - .cow = DEFAULT_COW, \ + .no_cow = 0, \ + .cow = DEFAULT_COW, \ } struct ubd ubd_dev[MAX_DEV] = { [ 0 ... 
MAX_DEV - 1 ] = DEFAULT_UBD }; @@ -369,9 +347,9 @@ str++; } - if (*str == '=') + if (*str == '=') printk(KERN_ERR "ubd_setup : Too many flags specified\n"); - else + else printk(KERN_ERR "ubd_setup : Expected '='\n"); goto out; @@ -469,53 +447,35 @@ */ int intr_count = 0; -/* call ubd_finish if you need to serialize */ -static void __ubd_finish(struct request *req, int error) -{ - int nsect; - - if(error){ - end_request(req, 0); - return; - } - nsect = req->current_nr_sectors; - req->sector += nsect; - req->buffer += nsect << 9; - req->errors = 0; - req->nr_sectors -= nsect; - req->current_nr_sectors = 0; - end_request(req, 1); -} - -static inline void ubd_finish(struct request *req, int error) -{ - spin_lock(&ubd_io_lock); - __ubd_finish(req, error); - spin_unlock(&ubd_io_lock); -} - /* Called without ubd_io_lock held */ static void ubd_handler(void) { - struct io_thread_req req; - struct request *rq = elv_next_request(ubd_queue); + struct io_thread_reply R; int n; - do_ubd = NULL; intr_count++; - n = os_read_file(thread_fd, &req, sizeof(req)); - if(n != sizeof(req)){ - printk(KERN_ERR "Pid %d - spurious interrupt in ubd_handler, " - "err = %d\n", os_getpid(), -n); + + while ((n = os_read_file(thread_fd, &R, sizeof R)) == sizeof R) { spin_lock(&ubd_io_lock); - end_request(rq, 0); + if (!end_that_request_first(R.req, !R.error, R.sectors)) { + if (!R.error) add_disk_randomness(R.req->rq_disk); + end_that_request_last(R.req); + } else { + printk("request %p incomplete (did %lu of %lu " + "sectors)\n", + R.req, R.sectors, R.req->nr_sectors); + elv_requeue_request(ubd_queue, R.req); + } spin_unlock(&ubd_io_lock); - return; } - - ubd_finish(rq, req.error); - reactivate_fd(thread_fd, UBD_IRQ); + + if (n != -EAGAIN) + printk("bad return %d from os_read_file in ubd_handler\n", n); + reactivate_fd(thread_fd, UBD_IRQ); + + spin_lock(&ubd_io_lock); do_ubd_request(ubd_queue); + spin_unlock(&ubd_io_lock); } static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused) 
@@ -529,8 +489,8 @@
 
 void kill_io_thread(void)
 {
-	if(io_pid != -1)
+	if(io_pid > 0)	/* io_pid is negative when the IO thread never started */
 		os_kill_process(io_pid, 1);
 }
 
 __uml_exitcall(kill_io_thread);
@@ -554,7 +514,7 @@
 	dev->cow.bitmap = NULL;
 }
 
-static int ubd_open_dev(struct ubd *dev)
+int ubd_open_dev(struct ubd *dev)
 {
 	struct openflags flags;
 	char **back_ptr;
@@ -580,6 +540,7 @@
 		}
 	}
 
+
 	if(dev->fd < 0){
 		printk("Failed to open '%s', errno = %d\n", dev->file,
 		       -dev->fd);
@@ -827,20 +788,55 @@
 	.bus = &platform_bus_type,
 };
 
 int ubd_init(void)
 {
-	int i;
+	int i, err;
+	unsigned long stack;
 
 	devfs_mk_dir("ubd");
 	if (register_blkdev(MAJOR_NR, "ubd"))
 		return -1;
 
-	ubd_queue = blk_init_queue(do_ubd_request, &ubd_io_lock);
-	if (!ubd_queue) {
+	if (!(ubd_queue = blk_init_queue(do_ubd_request, &ubd_io_lock))) {
+		printk(KERN_ERR "ubd: can't create queue; aborting\n");
 		unregister_blkdev(MAJOR_NR, "ubd");
 		return -1;
 	}
+	printk("ubd: initialised queue\n");
+
+	/* Advertise ability to do barrier requests, and max request size,
+	 * so we don't have to do memory management in the IO thread. */
+	spin_lock(&ubd_io_lock);
+	blk_queue_ordered(ubd_queue, QUEUE_ORDERED_TAG);
+	blk_queue_max_hw_segments(ubd_queue, MAX_SG);
+	blk_queue_max_segment_size(ubd_queue, 4 * 1024 * 1024);
+	blk_queue_max_sectors(ubd_queue, 8 * 1024);
+	spin_unlock(&ubd_io_lock);
+
+	printk("ubd: starting IO thread:");
+
+	stack = alloc_stack(0, 0);
+	if (!stack) printk("no new stack!\n");
+
+	io_pid = stack ? start_io_thread(stack + PAGE_SIZE - sizeof(void *))
+		       : -ENOMEM;	/* no stack: do not start the thread */
+	if (io_pid < 0)
+		printk(KERN_ERR "ubd: Failed to start I/O thread "
+		       "(errno = %d)\n", -io_pid);
+	else
+		printk(" %d", io_pid);
+	printk("\n");
+
+	os_set_fd_block(thread_fd, 0);
+
+	printk("ubd: requesting IRQ for fd %d... ", thread_fd);
+	err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr,
+			     SA_INTERRUPT, "ubd", ubd_dev);
+	if(err != 0)
+		printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err);
+	else
+		printk("OK\n");
 
 	if (fake_major != MAJOR_NR) {
 		char name[sizeof("ubd_nnn\0")];
 
@@ -858,8 +854,7 @@
 late_initcall(ubd_init);
 
 int ubd_driver_init(void){
-	unsigned long stack;
-	int err;
+	int err = 0;
 
 	/* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/
 	if(global_openflags.s){
@@ -867,20 +862,6 @@
 		/* Letting ubd=sync be like using ubd#s= instead of ubd#= is
 		 * enough. So use anyway the io thread. */
 	}
-	stack = alloc_stack(0, 0);
-	io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *),
-				 &thread_fd);
-	if(io_pid < 0){
-		printk(KERN_ERR
-		       "ubd : Failed to start I/O thread (errno = %d) - "
-		       "falling back to synchronous I/O\n", -io_pid);
-		io_pid = -1;
-		return(0);
-	}
-	err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr,
-			     SA_INTERRUPT, "ubd", ubd_dev);
-	if(err != 0)
-		printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err);
 	return(err);
 }
 
@@ -903,12 +884,6 @@
 	dev->count++;
 	set_disk_ro(disk, !dev->openflags.w);
 
-	/* This should no more be needed. 
And it didn't work anyway to exclude - * read-write remounting of filesystems.*/ - /*if((filp->f_mode & FMODE_WRITE) && !dev->openflags.w){ - if(--dev->count == 0) ubd_close(dev); - err = -EROFS; - }*/ out: return(err); } @@ -923,135 +898,24 @@ return(0); } -static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, - __u64 *cow_offset, unsigned long *bitmap, - __u64 bitmap_offset, unsigned long *bitmap_words, - __u64 bitmap_len) -{ - __u64 sector = io_offset >> 9; - int i, update_bitmap = 0; - - for(i = 0; i < length >> 9; i++){ - if(cow_mask != NULL) - ubd_set_bit(i, (unsigned char *) cow_mask); - if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) - continue; - - update_bitmap = 1; - ubd_set_bit(sector + i, (unsigned char *) bitmap); - } - - if(!update_bitmap) - return; - - *cow_offset = sector / (sizeof(unsigned long) * 8); - - /* This takes care of the case where we're exactly at the end of the - * device, and *cow_offset + 1 is off the end. So, just back it up - * by one word. Thanks to Lynn Kerby for the fix and James McMechan - * for the original diagnosis. 
- */ - if(*cow_offset == ((bitmap_len + sizeof(unsigned long) - 1) / - sizeof(unsigned long) - 1)) - (*cow_offset)--; - - bitmap_words[0] = bitmap[*cow_offset]; - bitmap_words[1] = bitmap[*cow_offset + 1]; - - *cow_offset *= sizeof(unsigned long); - *cow_offset += bitmap_offset; -} - -static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, - __u64 bitmap_offset, __u64 bitmap_len) -{ - __u64 sector = req->offset >> 9; - int i; - - if(req->length > (sizeof(req->sector_mask) * 8) << 9) - panic("Operation too long"); - - if(req->op == UBD_READ) { - for(i = 0; i < req->length >> 9; i++){ - if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) - ubd_set_bit(i, (unsigned char *) - &req->sector_mask); - } - } - else cowify_bitmap(req->offset, req->length, &req->sector_mask, - &req->cow_offset, bitmap, bitmap_offset, - req->bitmap_words, bitmap_len); -} - -/* Called with ubd_io_lock held */ -static int prepare_request(struct request *req, struct io_thread_req *io_req) -{ - struct gendisk *disk = req->rq_disk; - struct ubd *dev = disk->private_data; - __u64 offset; - int len; - - if(req->rq_status == RQ_INACTIVE) return(1); - - /* This should be impossible now */ - if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ - printk("Write attempted on readonly ubd device %s\n", - disk->disk_name); - end_request(req, 0); - return(1); - } - - offset = ((__u64) req->sector) << 9; - len = req->current_nr_sectors << 9; - - io_req->fds[0] = (dev->cow.file != NULL) ? dev->cow.fd : dev->fd; - io_req->fds[1] = dev->fd; - io_req->cow_offset = -1; - io_req->offset = offset; - io_req->length = len; - io_req->error = 0; - io_req->sector_mask = 0; - - io_req->op = (rq_data_dir(req) == READ) ? 
UBD_READ : UBD_WRITE; - io_req->offsets[0] = 0; - io_req->offsets[1] = dev->cow.data_offset; - io_req->buffer = req->buffer; - io_req->sectorsize = 1 << 9; - - if(dev->cow.file != NULL) - cowify_req(io_req, dev->cow.bitmap, dev->cow.bitmap_offset, - dev->cow.bitmap_len); - - return(0); -} - /* Called with ubd_io_lock held */ static void do_ubd_request(request_queue_t *q) { - struct io_thread_req io_req; struct request *req; - int err, n; - - if(thread_fd == -1){ - while((req = elv_next_request(q)) != NULL){ - err = prepare_request(req, &io_req); - if(!err){ - do_io(&io_req); - __ubd_finish(req, io_req.error); - } - } - } - else { - if(do_ubd || (req = elv_next_request(q)) == NULL) - return; - err = prepare_request(req, &io_req); - if(!err){ - do_ubd = ubd_handler; - n = os_write_file(thread_fd, (char *) &io_req, - sizeof(io_req)); - if(n != sizeof(io_req)) - printk("write to io thread failed, " - "errno = %d\n", -n); + while ((req = elv_next_request(ubd_queue))) { + struct io_thread_request r; + int n; + + r.req = req; + r.nsegs = blk_rq_map_sg(ubd_queue, req, r.sg); + if (r.nsegs > MAX_SG) + printk("do_ubd_request: blk_rq_map_sg gave us %d " + "segments, should be <= %d\n", r.nsegs, MAX_SG); + if ((n = os_write_file(thread_fd, &r, sizeof r)) == sizeof r) + blkdev_dequeue_request(req); + else { + if (n != -EAGAIN) printk("do_ubd_request: os_write_file returned %d, should be %d\n", n, sizeof r); + break; } } } @@ -1068,7 +932,7 @@ }; switch (cmd) { - struct hd_geometry g; + struct hd_geometry g; struct cdrom_volctrl volume; case HDIO_GETGEO: if(!loc) return(-EINVAL); @@ -1186,18 +1050,17 @@ __u32 version, align; char *backing_file; int fd, err, sectorsize, same, mode = 0644; - fd = os_open_file(file, *openflags, mode); if(fd < 0){ if((fd == -ENOENT) && (create_cow_out != NULL)) *create_cow_out = 1; - if(!openflags->w || - ((fd != -EROFS) && (fd != -EACCES))) return(fd); + if(!openflags->w || + ((fd != -EROFS) && (fd != -EACCES))) return(fd); openflags->w = 0; fd = 
os_open_file(file, *openflags, mode); if(fd < 0) return(fd); - } + } err = os_lock_file(fd, openflags->w); if(err < 0){ @@ -1238,7 +1101,7 @@ cow_sizes(version, size, sectorsize, align, *bitmap_offset_out, bitmap_len_out, data_offset_out); - return(fd); + return(fd); out_close: os_close_file(fd); return(err); @@ -1249,7 +1112,6 @@ unsigned long *bitmap_len_out, int *data_offset_out) { int err, fd; - flags.c = 1; fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL); if(fd < 0){ @@ -1269,89 +1131,6 @@ return(err); } -static int update_bitmap(struct io_thread_req *req) -{ - int n; - - if(req->cow_offset == -1) - return(0); - - n = os_seek_file(req->fds[1], req->cow_offset); - if(n < 0){ - printk("do_io - bitmap lseek failed : err = %d\n", -n); - return(1); - } - - n = os_write_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words)); - if(n != sizeof(req->bitmap_words)){ - printk("do_io - bitmap update failed, err = %d fd = %d\n", -n, - req->fds[1]); - return(1); - } - - return(0); -} - -void do_io(struct io_thread_req *req) -{ - char *buf; - unsigned long len; - int n, nsectors, start, end, bit; - int err; - __u64 off; - - nsectors = req->length / req->sectorsize; - start = 0; - do { - bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); - end = start; - while((end < nsectors) && - (ubd_test_bit(end, (unsigned char *) - &req->sector_mask) == bit)) - end++; - - off = req->offset + req->offsets[bit] + - start * req->sectorsize; - len = (end - start) * req->sectorsize; - buf = &req->buffer[start * req->sectorsize]; - - err = os_seek_file(req->fds[bit], off); - if(err < 0){ - printk("do_io - lseek failed : err = %d\n", -err); - req->error = 1; - return; - } - if(req->op == UBD_READ){ - n = 0; - do { - buf = &buf[n]; - len -= n; - n = os_read_file(req->fds[bit], buf, len); - if (n < 0) { - printk("do_io - read failed, err = %d " - "fd = %d\n", -n, req->fds[bit]); - req->error = 1; - return; - } - } while((n < len) && (n != 0)); - if (n < 
len) memset(&buf[n], 0, len - n);
-		} else {
-			n = os_write_file(req->fds[bit], buf, len);
-			if(n != len){
-				printk("do_io - write failed err = %d "
-				       "fd = %d\n", -n, req->fds[bit]);
-				req->error = 1;
-				return;
-			}
-		}
-
-		start = end;
-	} while(start < nsectors);
-
-	req->error = update_bitmap(req);
-}
-
 /* Changed in start_io_thread, which is serialized by being called only
  * from ubd_init, which is an initcall.
  */
@@ -1360,30 +1139,119 @@
 /* Only changed by the io thread */
 int io_count = 0;
 
+#define SECTORSIZE 512ll
 int io_thread(void *arg)
 {
-	struct io_thread_req req;
-	int n;
-
+	struct uml_iovec vector[MAX_SG];
 	ignore_sigwinch_sig();
-	while(1){
-		n = os_read_file(kernel_fd, &req, sizeof(req));
-		if(n != sizeof(req)){
-			if(n < 0)
-				printk("io_thread - read failed, fd = %d, "
-				       "err = %d\n", kernel_fd, -n);
-			else {
-				printk("io_thread - short read, fd = %d, "
-				       "length = %d\n", kernel_fd, n);
+	while (1) {
+		struct scatterlist *sg;
+		struct request *req;
+		struct ubd *ubd;
+		struct uml_iovec *v;
+		struct io_thread_request request;
+		struct io_thread_reply reply;
+		int count = 0, nsegs = 0, n, fd;
+		unsigned long total_request = 0, total_done = 0;
+
+		if ((n = os_read_file(kernel_fd, &request, sizeof request))
+		    != sizeof request) {
+			printk("io_thread - os_read_file returned %d, should "
+			       "be %d\n", n, (int) sizeof request);
+			return 0;
+		}
+
+		req = request.req;
+		nsegs = request.nsegs;
+		sg = request.sg;
+
+		reply.req = req;
+		reply.error = 0;
+
+		ubd = (struct ubd*)req->rq_disk->private_data;
+		fd = ubd->fd;
+
+		for (count = 0; count < nsegs; ++count) {
+			vector[count].iov_base = page_address(sg[count].page) +
+				sg[count].offset;
+			vector[count].iov_len = sg[count].length;
+			total_request += sg[count].length;
+		}
+		v = vector;
+
+		if ((n = os_seek_file(fd, req->sector * SECTORSIZE)) < 0) {
+			printk("io_thread - lseek failed, fd = %d, err = %d\n",
+			       fd, -n);
+			reply.error = 1;
+			reply.sectors = req->nr_sectors;
+			goto finish;
+		}
+again:
+
+		if (rq_data_dir(req))
+			n = os_writev(fd, v, count);
+		else
+			n = os_readv(fd, v, count);
+
+		if (n >= 0) {
+
+			if (n % SECTORSIZE)
+				printk("io_thread - readv/writev returned "
+				       "after completing non-integer number "
+				       "of sectors (%d bytes = %d sectors + "
+				       "%d bytes), fd = %d\n",
+				       n, (int) (n / SECTORSIZE),
+				       (int) (n % SECTORSIZE), fd);
+			total_done += n;
+			if (n == 0 && total_done < total_request) {
+				/* No progress (end of file): retrying would
+				 * spin forever. Reads past the end yield
+				 * zeroes, as the old do_io did; a write that
+				 * makes no progress is an error. */
+				if (rq_data_dir(req))
+					reply.error = 1;
+				else {
+					for (; count > 0; --count, ++v)
+						memset(v->iov_base, 0,
+						       v->iov_len);
+					total_done = total_request;
+				}
+			} else if (total_done < total_request) {
+				/* Stop at the last iovec even if n claims more
+				 * bytes than the vector holds. */
+				while (count && n >= v->iov_len) {
+					n -= v->iov_len;
+					--count;
+					++v;
+				}
+				if (count) {
+					v->iov_base += n;
+					v->iov_len -= n;
+					goto again;
+				} else
+					printk("io_thread - %d bytes remain "
+					       "after advancing through all "
+					       "iovecs\n", n);
 			}
-			continue;
+		} else {
+			printk("io_thread - readv/writev failed, fd = %d, "
+			       "err = %d\n", fd, -n);
+			reply.error = 1;
 		}
-		io_count++;
-		do_io(&req);
-		n = os_write_file(kernel_fd, &req, sizeof(req));
-		if(n != sizeof(req))
-			printk("io_thread - write failed, fd = %d, err = %d\n",
-			       kernel_fd, -n);
+		/* A failed request is completed in full, as on the lseek
+		 * error path, so it is not left partially done. */
+		reply.sectors = reply.error ? req->nr_sectors :
+			total_done / SECTORSIZE;
+finish:
+
+		if (blk_barrier_rq(req))
+			fdatasync(fd); /* error handling? */
+
+		if ((n = os_write_file(kernel_fd, &reply, sizeof reply))
+		    != sizeof reply)
+			printk("os_write_file returned %d, should be %d\n",
+			       n, (int) sizeof reply);
+		/* XXX success */
 	}
 }
 
diff -urN linux-2.6.14.3-vanilla/arch/um/drivers/ubd_user.c linux-2.6.14.3-20051130-newubd/arch/um/drivers/ubd_user.c
--- linux-2.6.14.3-vanilla/arch/um/drivers/ubd_user.c	2005-11-24 22:10:21.000000000 +0000
+++ linux-2.6.14.3-20051130-newubd/arch/um/drivers/ubd_user.c	2005-11-30 00:30:33.000000000 +0000
@@ -31,34 +31,35 @@
 	signal(SIGWINCH, SIG_IGN);
 }
 
-int start_io_thread(unsigned long sp, int *fd_out)
+int start_io_thread(unsigned long sp)
 {
 	int pid, fds[2], err;
+	extern int kernel_fd, thread_fd;
 
-	err = os_pipe(fds, 1, 1);
-	if(err < 0){
-		printk("start_io_thread - os_pipe failed, err = %d\n", -err);
-		goto out;
+	/* First thread. */
+	if (kernel_fd == -1) {
+		/* Use a datagram socket pair so that we can have >1 thread. */
+		err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds);
+		if(err == -1) {
+			err = -errno;	/* save errno before printk can change it */
+			printk("start_io_thread - socketpair failed, err = %d\n", -err);
+			kernel_fd = -1;
+			thread_fd = -1;
+			goto out;
+		}
+		kernel_fd = fds[0];
+		thread_fd = fds[1];
 	}
 
-	kernel_fd = fds[0];
-	*fd_out = fds[1];
-
-	pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD,
-		    NULL);
-	if(pid < 0){
+	pid = clone(io_thread, (void *)sp, CLONE_FILES | CLONE_VM | SIGCHLD, NULL);
+	if (pid < 0){
 		printk("start_io_thread - clone failed : errno = %d\n", errno);
 		err = -errno;
-		goto out_close;
+		goto out;
 	}
 
 	return(pid);
 
- out_close:
-	os_close_file(fds[0]);
-	os_close_file(fds[1]);
-	kernel_fd = -1;
-	*fd_out = -1;
  out:
 	return(err);
 }
diff -urN linux-2.6.14.3-vanilla/arch/um/include/os.h linux-2.6.14.3-20051130-newubd/arch/um/include/os.h
--- linux-2.6.14.3-vanilla/arch/um/include/os.h	2005-11-24 22:10:21.000000000 +0000
+++ linux-2.6.14.3-20051130-newubd/arch/um/include/os.h	2005-11-30 00:38:53.000000000 +0000
@@ -154,6 +154,14 @@
 extern int os_file_mode(char *file, struct openflags *mode_out);
 extern int os_lock_file(int fd, int excl);
 
+struct uml_iovec {
+	char *iov_base;
+	unsigned long iov_len;
+};
+
+extern int os_readv(int fd, const struct uml_iovec *v, int count);
+extern int os_writev(int fd, const struct uml_iovec *v, int count);
+
 /* start_up.c */
 extern void os_early_checks(void);
 extern int can_do_skas(void);
diff -urN linux-2.6.14.3-vanilla/arch/um/include/ubd_user.h linux-2.6.14.3-20051130-newubd/arch/um/include/ubd_user.h
--- linux-2.6.14.3-vanilla/arch/um/include/ubd_user.h	2005-11-24 22:10:21.000000000 +0000
+++ linux-2.6.14.3-20051130-newubd/arch/um/include/ubd_user.h	2005-11-30 00:35:02.000000000 +0000
@@ -8,7 +8,7 @@
 #define __UM_UBD_USER_H
 
 extern void ignore_sigwinch_sig(void);
-extern int start_io_thread(unsigned long sp, int *fds_out);
+extern int start_io_thread(unsigned long sp);
 extern int io_thread(void *arg);
 extern int kernel_fd;
 
diff -urN 
linux-2.6.14.3-vanilla/arch/um/os-Linux/file.c linux-2.6.14.3-20051130-newubd/arch/um/os-Linux/file.c
--- linux-2.6.14.3-vanilla/arch/um/os-Linux/file.c	2005-11-24 22:10:21.000000000 +0000
+++ linux-2.6.14.3-20051130-newubd/arch/um/os-Linux/file.c	2005-11-30 00:30:33.000000000 +0000
@@ -628,7 +628,8 @@
 	struct flock lock = ((struct flock) { .l_type = type,
 					      .l_whence = SEEK_SET,
 					      .l_start = 0,
-					      .l_len = 0 } );
+					      .l_len = 0,
+					      .l_pid = 0} );
 	int err, save;
 
 	err = fcntl(fd, F_SETLK, &lock);
@@ -648,6 +649,98 @@
 	return(err);
 }
 
+/* Scatter/gather IO. We need to pass the *system's* struct iovec to
+ * readv/writev, not the kernel's, so we need to translate our internal
+ * version of struct iovec -- struct uml_iovec. */
+#define NIOVEC 32
+
+/* Step the iovec array past the first n bytes: drop entries that were
+ * fully consumed and trim the partly consumed one. *c is the number of
+ * entries, updated to the number still left. */
+static struct iovec *iovec_advance(int n, struct iovec *vector, int *c)
+{
+	int count = *c;
+
+	/* Check count first: without it a byte count covering every entry
+	 * would walk past the end of the array. */
+	while (count > 0 && n >= vector->iov_len) {
+		n -= vector->iov_len;
+		++vector;
+		--count;
+	}
+	if (count) {
+		vector->iov_len -= n;
+		vector->iov_base += n;
+	}
+	*c = count;
+	return vector;
+}
+
+/* Run sgio_proc (readv or writev) on fd over at most NIOVEC entries of v,
+ * retrying after EINTR, after a repaired fault in the first buffer and
+ * after short transfers. Returns the number of bytes transferred (short
+ * at end of file), or a negative errno if nothing was transferred. */
+static int file_sgio(int fd, const struct uml_iovec *v, int count,
+		     int (*sgio_proc)(int, const struct iovec*, int),
+		     int (*copy_user_proc)(void*, void*, int))
+{
+	int i, n, ret = 0;
+	struct iovec *vector, vv[NIOVEC];
+	size_t len;
+
+	for (i = 0, len = 0; i < count && i < NIOVEC; ++i) {
+		/* XXX check range */
+		vv[i].iov_base = v[i].iov_base;
+		vv[i].iov_len = v[i].iov_len;
+		len += v[i].iov_len;
+	}
+
+	count = i;
+	vector = vv;
+
+	/* This is nasty. We could take a fault on any of the buffers in the
+	 * list. If it is the first buffer in the list, then writev will give
+	 * EFAULT immediately; otherwise, a short write will occur. In the
+	 * latter case we should re-do the write having adjusted the buffers
+	 * appropriately. */
+	while (count > 0 && (size_t) ret < len) {
+		n = sgio_proc(fd, vector, count);
+		if (n == -1) {
+			n = -errno;
+			if (n == -EINTR)
+				continue;
+			/* Fault in first buffer: fault it in and retry. */
+			if (n == -EFAULT &&
+			    !fault_buffer(vector->iov_base, vector->iov_len,
+					  copy_user_proc))
+				continue;
+			/* Hard error: report it unless data already moved. */
+			return ret ? ret : n;
+		}
+		/* Nothing transferred (end of file): looping again would
+		 * never terminate, so return the short count. */
+		if (n == 0)
+			break;
+		/* Count each transfer exactly once, then step past it. */
+		ret += n;
+		vector = iovec_advance(n, vector, &count);
+	}
+
+	return ret;
+}
+
+/* Read from fd into the count buffers described by v (see file_sgio). */
+int os_readv(int fd, const struct uml_iovec *v, int count)
+{
+	return file_sgio(fd, v, count, readv, copy_from_user_proc);
+}
+
+/* Write the count buffers described by v to fd (see file_sgio). */
+int os_writev(int fd, const struct uml_iovec *v, int count)
+{
+	return file_sgio(fd, v, count, writev, copy_to_user_proc);
+}
+
 /*
  * Overrides for Emacs so that we follow Linus's tabbing style.
  * Emacs will notice this stuff at the end of the file and automatically