Storage

Some important points of the EXT4 write flow

1. Wrap the user buffer from the write syscall into an iovec.


/*
 * Synchronous write entry point: wraps the flat user buffer in a
 * single-segment iovec, drives the filesystem's aio_write method, and
 * blocks until the request completes.  On return *ppos is advanced to
 * the new file position recorded in the kiocb.
 */
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
    struct kiocb iocb;
    struct iovec vec;
    ssize_t result;

    vec.iov_base = (void __user *)buf;
    vec.iov_len = len;

    init_sync_kiocb(&iocb, filp);
    iocb.ki_pos = *ppos;
    iocb.ki_left = len;
    iocb.ki_nbytes = len;

    result = filp->f_op->aio_write(&iocb, &vec, 1, iocb.ki_pos);
    /* the request may have been queued asynchronously; wait it out */
    if (result == -EIOCBQUEUED)
        result = wait_on_sync_kiocb(&iocb);

    *ppos = iocb.ki_pos;
    return result;
}

2. Convert the iovec array into an iov_iter.


/*
 * Initialize an iov_iter over the @nr_segs segments of @iov, then skip
 * past the @written bytes that have already been consumed so the iterator
 * points at the first unwritten byte.
 */
static inline void iov_iter_init(struct iov_iter *i,
            const struct iovec *iov, unsigned long nr_segs,
            size_t count, size_t written)
{
    i->iov_offset = 0;
    i->nr_segs = nr_segs;
    i->iov = iov;
    /* total byte count covers the already-written prefix too ... */
    i->count = written + count;

    /* ... which iov_iter_advance() immediately steps over */
    iov_iter_advance(i, written);
}

3. Copy data from the iov_iter into a page-cache page.

copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);


/*
 * Copy data from the iov_iter @i into the page cache of @file, one page
 * at a time, starting at @pos.  For each chunk: fault the user pages in,
 * ask the filesystem to prepare the page (->write_begin), copy the bytes
 * with pagefaults disabled, then let the filesystem commit the page
 * (->write_end) and advance the iterator.
 *
 * Returns the number of bytes written, or a negative errno if nothing
 * was written at all.
 */
static ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos){

    struct address_space *mapping = file->f_mapping;
    const struct address_space_operations *a_ops = mapping->a_ops;
    long status = 0;
    ssize_t written = 0;
    unsigned int flags = 0;

    /* kernel-space writers cannot take (and must not see) -EINTR here */
    if (segment_eq(get_fs(), KERNEL_DS))
        flags |= AOP_FLAG_UNINTERRUPTIBLE;

    do {
        struct page *page;
        unsigned long offset;    /* Offset into pagecache page */
        unsigned long bytes;    /* Bytes to write to page */
        size_t copied;        /* Bytes copied from user */
        void *fsdata;

        offset = (pos & (PAGE_CACHE_SIZE - 1));
        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i) );
again:
        /*
         * Fault in the user pages up front: we must not take a page
         * fault while holding the (locked) destination page, or we can
         * deadlock against ourselves on mmap'ed writes.
         */
        if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
            status = -EFAULT;
            break;
        }

        status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                        &page, &fsdata);
        if (unlikely(status))
            break;

        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

        pagefault_disable();
        copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
        pagefault_enable();
        flush_dcache_page(page);

        /* commit the (possibly short) copy; write_end unlocks the page */
        status = a_ops->write_end(file, mapping, pos, bytes, copied,
                        page, fsdata);
        if (unlikely(status < 0))
            break;
        copied = status;

        cond_resched();

        iov_iter_advance(i, copied);
        if (unlikely(copied == 0)) {
            /*
             * The atomic copy made no progress (the source page was
             * reclaimed between fault-in and copy).  Retry with a
             * chunk bounded by a single iovec segment so fault-in is
             * guaranteed to cover what we copy.
             */
            bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                        iov_iter_single_seg_count(i));
            goto again;
        }
        pos += copied;
        written += copied;

        /* throttle the writer if too many pages are dirty */
        balance_dirty_pages_ratelimited(mapping);

    } while (iov_iter_count(i));
    return written ? written : status;
}

4. Map the page to buffer heads.

head = create_page_buffers(page, inode, 0);


/*
 * Attach buffer heads to @page in preparation for writing the byte range
 * [pos, pos + len) and, in the full kernel version, map each buffer to a
 * disk block via @get_block and read in any partially-overwritten blocks.
 *
 * NOTE(review): this excerpt elides the per-buffer mapping loop and the
 * read wait; several locals (block, blocksize, bbits, bh, wait[], ...)
 * are used only by the elided code.
 *
 * Returns 0 on success or a negative errno.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
        get_block_t *get_block)
{
    unsigned from = pos & (PAGE_CACHE_SIZE - 1);  /* write start within page */
    unsigned to = from + len;                     /* write end within page */
    struct inode *inode = page->mapping->host;
    unsigned block_start, block_end;
    sector_t block;
    int err = 0;
    unsigned blocksize, bbits;
    struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

    /* caller must hold the page lock; the range must lie inside the page */
    BUG_ON(!PageLocked(page));
    BUG_ON(from > PAGE_CACHE_SIZE);
    BUG_ON(to > PAGE_CACHE_SIZE);
    BUG_ON(from > to);

    head = create_page_buffers(page, inode, 0);

    /* ... per-buffer get_block mapping and read-in elided ... */

    return err;  /* fix: non-void function previously fell off the end (UB) */
}

5. Submit the page as a bio.

    memset(&io_submit, 0, sizeof(io_submit));
    ret = ext4_bio_write_page(&io_submit, page, len, wbc);
    ext4_io_submit(&io_submit);

/*
 * Write back one dirty page for writepage()/writeback.  Decides between
 * three paths: redirty (delayed/unwritten buffers that need allocation),
 * the journalled-data path, or a direct bio submission.
 *
 * Called with the page locked; returns 0 or a negative errno from
 * ext4_bio_write_page().
 */
static int ext4_writepage(struct page *page,
              struct writeback_control *wbc)
{
    int ret = 0;
    loff_t size;
    unsigned int len;
    struct buffer_head *page_bufs = NULL;
    struct inode *inode = page->mapping->host;
    struct ext4_io_submit io_submit;

    trace_ext4_writepage(page);
    size = i_size_read(inode);
    /* last page of the file: only write up to i_size, not the full page */
    if (page->index == size >> PAGE_CACHE_SHIFT)
        len = size & ~PAGE_CACHE_MASK;
    else
        len = PAGE_CACHE_SIZE;

    page_bufs = page_buffers(page);
    /*
     * We cannot do block allocation or other extent handling in this
     * function. If there are buffers needing that, we have to redirty
     * the page. But we may reach here when we do a journal commit via
     * journal_submit_inode_data_buffers() and in that case we must write
     * allocated buffers to achieve data=ordered mode guarantees.
     */
    if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                   ext4_bh_delay_or_unwritten)) {
        redirty_page_for_writepage(wbc, page);
        if (current->flags & PF_MEMALLOC) {
            /*
             * For memory cleaning there's no point in writing only
             * some buffers. So just bail out. Warn if we came here
             * from direct reclaim.
             */
            WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
                            == PF_MEMALLOC);
            unlock_page(page);
            return 0;
        }
    }

    if (PageChecked(page) && ext4_should_journal_data(inode))
        /*
         * It's mmapped pagecache.  Add buffers and journal it.  There
         * doesn't seem much point in redirtying the page here.
         */
        return __ext4_journalled_writepage(page, len);

    /* normal path: package the page's buffers into a bio and submit */
    memset(&io_submit, 0, sizeof(io_submit));
    ret = ext4_bio_write_page(&io_submit, page, len, wbc);
    ext4_io_submit(&io_submit);
    return ret;
}

6. CFQ I/O priority handling.

int ioprio = cic->icq.ioc->ioprio;


/*
 * Propagate an ioprio change from the io_context into the cfq queues
 * attached to @cic: the async queue is dropped and re-acquired at the
 * new priority, while the sync queue is only flagged so its priority is
 * re-evaluated lazily.
 */
static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
{
    struct cfq_data *cfqd = cic_to_cfqd(cic);
    int ioprio = cic->icq.ioc->ioprio;
    struct cfq_queue *async_cfqq, *sync_cfqq;

    /*
     * Bail out when there is no cfqd yet, or when the priority did not
     * actually change.  The comparison may fire spuriously on a newly
     * created cic, which is harmless.
     */
    if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
        return;

    async_cfqq = cic->cfqq[BLK_RW_ASYNC];
    if (async_cfqq) {
        struct cfq_queue *replacement;

        replacement = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
                     GFP_ATOMIC);
        if (replacement) {
            /* swap in the queue at the new priority, drop the old one */
            cic->cfqq[BLK_RW_ASYNC] = replacement;
            cfq_put_queue(async_cfqq);
        }
    }

    sync_cfqq = cic->cfqq[BLK_RW_SYNC];
    if (sync_cfqq)
        cfq_mark_cfqq_prio_changed(sync_cfqq);

    cic->ioprio = ioprio;
}