概述
本文主要來討論Linux Block子系統中的IO調度層。我們知道應用層發起磁盤數據訪問時內核並不會立即將請求下發到磁盤的驅動程序中進行響應,而是做適當的延遲,嘗試能否擴展之前請求的磁盤范圍來滿足該請求。這樣做的好處也很明顯,以機械硬盤為例,訪問不同位置的數據是通過磁頭的移動實現的,如果下發給驅動程序的請求是按照磁頭移動的方向進行了排序,那么磁盤只需要按照特定的方向連續地訪問數據即可響應這些請求,節省了磁頭移動定位的時間。對IO請求進行排序和合並就是IO調度層的主要工作,由於這種機制很像我們現實生活中的電梯(只朝着一個方向運行),因此IO調度層所使用的算法也被統稱為電梯調度算法。
數據結構
IO調度層涉及到的數據結構主要為兩種,request表示IO請求,由通用塊層的bio初始化或者合並得到;request_queue表示請求隊列,包含了對一個塊設備的所有request。下面我們來看一下這兩種數據結構中主要的成員。
/*
 * struct request - a single IO request handled by the block layer.
 *
 * Several members overlay each other in anonymous unions; the comment in
 * front of each union states when each view is in use.
 *
 * NOTE(review): the __GENKSYMS__ branches and the RH_KABI_* macros look like
 * kABI-preservation helpers, so member order and size are presumably
 * significant -- confirm before moving or resizing anything here.
 */
struct request {
	/*
	 * List linkage of the request: it chains the request on a plug list
	 * and, once dispatched, on request_queue.queue_head.
	 */
#ifdef __GENKSYMS__
	union {
		struct list_head queuelist;
		struct llist_node ll_list;
	};
#else
	struct list_head queuelist;
#endif
	union {
		struct call_single_data csd;
		RH_KABI_REPLACE(struct work_struct mq_flush_work,
				unsigned long fifo_time)
	};
	struct request_queue *q;	/* queue this request belongs to */
	struct blk_mq_ctx *mq_ctx;
	u64 cmd_flags;
	enum rq_cmd_type_bits cmd_type;
	unsigned long atomic_flags;
	int cpu;
	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */
	struct bio *bio;
	struct bio *biotail;
#ifdef __GENKSYMS__
	struct hlist_node hash;		/* merge hash */
#else
	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct list_head ipi_list;
	};
#endif
	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		void *completion_data;
	};
	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it. Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq *icq;
			void *priv[2];
		} elv;
		struct {
			unsigned int seq;
			struct list_head list;
			rq_end_io_fn *saved_end_io;
		} flush;
	};
	struct gendisk *rq_disk;
	struct hd_struct *part;
	unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
	struct request_list *rl;		/* rl this rq is alloced from */
	unsigned long long start_time_ns;
	unsigned long long io_start_time_ns;	/* when passed to hardware */
#endif
	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	unsigned short nr_integrity_segments;
#endif
	unsigned short ioprio;
	void *special;		/* opaque pointer available for LLD use */
	char *buffer;		/* kaddr of the current segment if available */
	int tag;
	int errors;
	/*
	 * when request is used as a packet command carrier
	 */
	unsigned char __cmd[BLK_MAX_CDB];
	unsigned char *cmd;
	unsigned short cmd_len;
	unsigned int extra_len;	/* length of alignment and padding */
	unsigned int sense_len;
	unsigned int resid_len;	/* residual count */
	void *sense;
	unsigned long deadline;
	struct list_head timeout_list;
	unsigned int timeout;
	int retries;
	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
	/* for bidi */
	struct request *next_rq;
};
/*
 * struct request_queue - the request queue of one block device.
 *
 * NOTE(review): the RH_KABI_* macro lines carry no trailing ';' in this
 * listing; the macros presumably supply it themselves, and the layout is
 * presumably kABI-sensitive -- confirm before reordering members.
 */
struct request_queue {
	/* Dispatch list: elv_dispatch_add_tail() appends requests here. */
	struct list_head queue_head;
	/* Reset to NULL by elv_dispatch_add_tail() when that request is dispatched. */
	struct request *last_merge;
	struct elevator_queue *elevator;
	int nr_rqs[2];			/* # allocated [a]sync rqs */
	int nr_rqs_elvpriv;		/* # allocated rqs w/ elvpriv */
	struct request_list root_rl;
	request_fn_proc *request_fn;
	make_request_fn *make_request_fn;
	prep_rq_fn *prep_rq_fn;
	merge_bvec_fn *merge_bvec_fn;
	softirq_done_fn *softirq_done_fn;
	rq_timed_out_fn *rq_timed_out_fn;
	dma_drain_needed_fn *dma_drain_needed;
	lld_busy_fn *lld_busy_fn;
	RH_KABI_CONST struct blk_mq_ops *mq_ops;
	unsigned int *mq_map;
	/* sw queues */
	RH_KABI_REPLACE(struct blk_mq_ctx *queue_ctx,
			struct blk_mq_ctx __percpu *queue_ctx)
	unsigned int nr_queues;
	/* hw dispatch queues */
	struct blk_mq_hw_ctx **queue_hw_ctx;
	unsigned int nr_hw_queues;
	/* Both updated by elv_dispatch_add_tail() from the dispatched request. */
	sector_t end_sector;
	struct request *boundary_rq;
	/* Work item whose handler, blk_delay_work(), runs the queue. */
	struct delayed_work delay_work;
	struct backing_dev_info backing_dev_info;
	void *queuedata;
	unsigned long queue_flags;
	int id;
	gfp_t bounce_gfp;
	spinlock_t __queue_lock;
	/* Lock taken around __blk_run_queue() and while adding requests. */
	spinlock_t *queue_lock;
	struct kobject kobj;
	struct kobject mq_kobj;
#ifdef CONFIG_PM_RUNTIME
	struct device *dev;
	int rpm_status;
	unsigned int nr_pending;
#endif
	unsigned long nr_requests;	/* Max # of requests */
	unsigned int nr_congestion_on;
	unsigned int nr_congestion_off;
	unsigned int nr_batching;
	unsigned int dma_drain_size;
	void *dma_drain_buffer;
	unsigned int dma_pad_mask;
	unsigned int dma_alignment;
	struct blk_queue_tag *queue_tags;
	struct list_head tag_busy_list;
	/* Decremented by elv_dispatch_add_tail() for every dispatched request. */
	unsigned int nr_sorted;
	unsigned int in_flight[2];
	unsigned int request_fn_active;
	unsigned int rq_timeout;
	struct timer_list timeout;
	struct list_head timeout_list;
	struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
	struct blkcg_gq *root_blkg;
	struct list_head blkg_list;
#endif
	struct queue_limits limits;
	unsigned int sg_timeout;
	unsigned int sg_reserved_size;
	int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace *blk_trace;
#endif
	/*
	 * for flush operations
	 */
	unsigned int flush_flags;
	unsigned int flush_not_queueable:1;
	RH_KABI_DEPRECATE(unsigned int, flush_queue_delayed:1)
	RH_KABI_DEPRECATE(unsigned int, flush_pending_idx:1)
	RH_KABI_DEPRECATE(unsigned int, flush_running_idx:1)
	RH_KABI_DEPRECATE(unsigned long, flush_pending_since)
	RH_KABI_DEPRECATE(struct list_head, flush_queue[2])
	RH_KABI_DEPRECATE(struct list_head, flush_data_in_flight)
	RH_KABI_DEPRECATE(struct request *, flush_rq)
	RH_KABI_DEPRECATE(spinlock_t, mq_flush_lock)
	struct mutex sysfs_lock;
	int bypass_depth;
#if defined(CONFIG_BLK_DEV_BSG)
	bsg_job_fn *bsg_job_fn;
	int bsg_job_size;
	struct bsg_class_device bsg_dev;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
	struct rcu_head rcu_head;
	wait_queue_head_t mq_freeze_wq;
	RH_KABI_DEPRECATE(struct percpu_counter, mq_usage_counter)
	struct list_head all_q_node;
	RH_KABI_EXTEND(unprep_rq_fn *unprep_rq_fn)
	RH_KABI_EXTEND(struct blk_mq_tag_set *tag_set)
	RH_KABI_EXTEND(struct list_head tag_set_list)
	RH_KABI_EXTEND(struct list_head requeue_list)
	RH_KABI_EXTEND(spinlock_t requeue_lock)
	/* requeue_work's type is changed from 'work_struct' to 'delayed_work' below */
	RH_KABI_EXTEND(struct work_struct rh_reserved_requeue_work)
	RH_KABI_EXTEND(atomic_t mq_freeze_depth)
	RH_KABI_EXTEND(struct blk_flush_queue *fq)
	RH_KABI_EXTEND(struct percpu_ref q_usage_counter)
	RH_KABI_EXTEND(bool mq_sysfs_init_done)
	RH_KABI_EXTEND(struct work_struct timeout_work)
	RH_KABI_EXTEND(struct delayed_work requeue_work)
	RH_KABI_EXTEND(struct blk_queue_stats *stats)
	RH_KABI_EXTEND(struct blk_stat_callback *poll_cb)
	RH_KABI_EXTEND(struct blk_rq_stat poll_stat[2])
	RH_KABI_EXTEND(atomic_t shared_hctx_restart)
	RH_KABI_EXTEND(unsigned int queue_depth)
	RH_KABI_EXTEND(unsigned int front_queue:1)
	RH_KABI_EXTEND(unsigned int tail_queue:1)
	/* This flag is set if the driver can split bio */
	RH_KABI_EXTEND(unsigned int can_split_bio:1)
#ifdef CONFIG_BLK_DEBUG_FS
	RH_KABI_EXTEND(struct dentry *debugfs_dir)
	RH_KABI_EXTEND(struct dentry *sched_debugfs_dir)
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
	RH_KABI_EXTEND(struct mutex blk_trace_mutex)
#endif
	RH_KABI_EXTEND(init_rq_fn *init_rq_fn)
	RH_KABI_EXTEND(exit_rq_fn *exit_rq_fn)
	RH_KABI_EXTEND(size_t cmd_size)
	RH_KABI_EXTEND(void *rq_alloc_data)
};
IO調度
IO調度層的核心就是IO請求的調度,涉及到IO請求合並與下發的時間點,如何提高IO請求效率等問題。接下來我們將討論IO請求如何合並,IO調度層的調度器。
IO請求合並
在討論IO請求如何合並前,需要明確為什么需要進行IO請求合並。前文已經說過合並有利於提高磁盤IO效率,具體是如何體現呢?假設現在應用層發起了一個數據訪問請求,請求的扇區為5~10;此時另一個用戶進程緊跟着也發起了一個請求,訪問的扇區是1~4,在沒有IO請求合並的情況下,驅動程序驅動磁盤讀取5~10扇區的數據后將磁頭移動回第1扇區,接着讀取1~4扇區數據,這樣做使得有一部分時間浪費在了磁頭移動上(實際上磁頭移動時間比數據訪問時間要大)。因此Linux中引入了IO請求合並,將第一個請求向后延遲,第二個請求出現后發現可以和第一個請求合並,直接訪問1~10扇區的數據,這樣就節省了磁頭移動的時間。
Linux中IO請求的合並也比較復雜,首先我們來參考一張前人總結的IO合並圖,圖中描述了應用層發起數據訪問請求后IO請求經過了哪些merge的點,最終下發到塊設備的請求隊列中。

圖1 IO合並
如圖1所示,對IO請求進行merge的點有三處:
- Cache:緩存機制層面的合並,本文不詳細討論。
- Plug List:蓄流泄流機制,進程會進行蓄流操作,將本進程的IO請求合並,並在適當的時機將合並后的請求泄流進調度隊列中。
- Elevator Queue:在調度器將請求添加到調度隊列時會進行IO請求的合並。
從圖中我們可以看到應用層發起數據請求后不管是經過Cache還是跳過Cache機制,都需要經過上述三個merge點中的至少一個,這樣就保證最終下發給驅動的請求經過了最大可能的合並和排序。接下來我們詳細討論一下蓄流泄流機制以及IO調度層的調度器,理解IO請求是如何進行合並的。
蓄流泄流
plug/unplug即蓄流和泄流,是Linux中提高IO合並效率的一種機制。從圖1中可以看到一部分IO請求在被添加到調度隊列中時會先經過Plug List一層,然后經過unplug操作才會被添加到調度隊列中,這整個過程被稱為蓄流和泄流。該機制主要涉及到的數據結構只有一種:
/*
 * struct blk_plug - state of one plug.
 *
 * blk_start_plug() sets magic to PLUG_MAGIC and initializes all three list
 * heads to empty.
 */
struct blk_plug {
unsigned long magic; /* detect uninitialized use-cases */
struct list_head list; /* head of the list of requests held in the plug */
struct list_head mq_list; /* blk-mq requests */
struct list_head cb_list; /* md requires an unplug callback */
};
每個進程都會有一個Plug List(如果支持plug機制),上層下發的IO請求進入Plug List后都會被鏈入list成員指向的鏈表中。因為同一個進程中的IO請求訪問磁盤中相鄰連續扇區的可能性更大,所以在Plug List中IO請求合並的幾率也更大,而且沒有把IO請求直接添加到調度隊列中,這也使得調度隊列的負荷降低。蓄流泄流機制的通用流程如圖2所示:

圖2 plug流程
如果支持蓄流泄流機制,內核一般會在IO調度層之上開啟蓄流機制,並將bio提交給調度層並添加到Plug List中,最后進行泄流操作將Plug List中的request提交給調度器,由調度器進行進一步的合並和排序並添加到調度器的調度隊列中。
圖2中調用blk_start_plug開啟蓄流,最后調用blk_finish_plug進行泄流操作,這兩個函數一般都是成對出現。我們先來看一下blk_start_plug函數:
/*
 * blk_start_plug - initialize @plug and install it on the current task.
 * @plug: caller-provided plug to set up
 *
 * The plug is always initialized. It becomes current->plug only when the
 * task has no plug installed yet.
 */
void blk_start_plug(struct blk_plug *plug)
{
	struct task_struct *task = current;

	INIT_LIST_HEAD(&plug->list);
	INIT_LIST_HEAD(&plug->mq_list);
	INIT_LIST_HEAD(&plug->cb_list);
	plug->magic = PLUG_MAGIC;

	/*
	 * Nested plug: keep the one already installed. This plug is not
	 * assigned; it will be flushed on its own.
	 */
	if (task->plug)
		return;

	/*
	 * No explicit store ordering is required: a potential preemption
	 * implies a full memory barrier.
	 */
	task->plug = plug;
}
該函數功能比較簡單,先初始化調用者傳入的plug,然后在當前進程尚未持有plug(即不是嵌套調用)的情況下,將其記錄到當前進程描述符current的plug字段中,后續該進程訪問Plug List都是通過訪問current->plug的方式實現的。此處也體現了前文所說的每個進程都有一個Plug List。
接着我們來分析一下泄流操作:
/*
 * blk_finish_plug - flush @plug and detach it from the current task.
 * @plug: plug previously passed to blk_start_plug()
 *
 * Flushes the plug list with from_schedule == false. current->plug is
 * cleared only when @plug is the plug installed on the current task.
 */
void blk_finish_plug(struct blk_plug *plug)
{
	blk_flush_plug_list(plug, false);

	/* A plug that is not installed on the task leaves current->plug alone. */
	if (current->plug != plug)
		return;

	current->plug = NULL;
}
從代碼中我們可以看到泄流操作調用了blk_flush_plug_list將plug中的request泄流到調度隊列中,並設置current->plug字段為NULL。我們接着往下分析blk_flush_plug_list是如何實現泄流的。
/*
 * blk_flush_plug_list - hand the requests held on a plug list to their queues.
 * @plug: plug whose ->list is drained
 * @from_schedule: forwarded unchanged to queue_unplugged()
 *
 * This listing is an excerpt: each "..." line stands for code that was left
 * out, including the declarations of list, q, depth, flags and rq.
 */
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
...
/* Nothing was plugged, so there is nothing to flush. */
if (list_empty(&plug->list))
return;
/*
 * Move every plugged request onto the local list (leaving plug->list
 * empty) and sort it with plug_rq_cmp.
 * NOTE(review): presumably the sort groups requests of the same queue
 * together -- confirm against plug_rq_cmp.
 */
list_splice_init(&plug->list, &list);
list_sort(NULL, &list, plug_rq_cmp);
q = NULL;
depth = 0;
/* Interrupts are disabled once here; queue locks below use plain spin_lock(). */
local_irq_save(flags);
while (!list_empty(&list)) {
/* Pop the first request off the local list. */
rq = list_entry_rq(list.next);
list_del_init(&rq->queuelist);
BUG_ON(!rq->q);
/*
 * The request targets a different queue than the previous one: unplug the
 * previous queue (queue_unplugged() also drops its lock), then lock the
 * new queue and restart the per-queue counter.
 */
if (rq->q != q) {
if (q)
queue_unplugged(q, depth, from_schedule);
q = rq->q;
depth = 0;
spin_lock(q->queue_lock);
}
...
/* Hand the request to the elevator and count it for this queue. */
elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
depth++;
}
/* Unplug the last queue that was locked inside the loop. */
if (q)
queue_unplugged(q, depth, from_schedule);
local_irq_restore(flags);
}
以上是blk_flush_plug_list核心的代碼,主要是將Plug List中的request通過elv_add_request函數提交給調度器,由調度器進行排序和進一步的合並,最后調用queue_unplugged,該函數會調用塊設備驅動程序提供的request響應函數,從調度隊列中獲取request進行數據傳輸,至此泄流操作完成。
從上面的分析中可以看出泄流最關鍵的一步就是調用queue_unplugged函數將IO請求下發給驅動程序,由驅動程序進行數據傳輸。我們來看看這個函數做了哪些操作:
/*
 * queue_unplugged - run a queue after plugged requests were handed to it.
 * @q: queue to run; its queue_lock is released before returning
 * @depth: value reported to the block_unplug tracepoint
 * @from_schedule: true selects blk_run_queue_async(), false selects
 *                 __blk_run_queue()
 */
static void queue_unplugged(struct request_queue *q, unsigned int depth,
			    bool from_schedule)
	__releases(q->queue_lock)
{
	const bool run_now = !from_schedule;

	trace_block_unplug(q, depth, run_now);

	if (run_now)
		__blk_run_queue(q);
	else
		blk_run_queue_async(q);

	spin_unlock(q->queue_lock);
}
這個函數通過判斷from_schedule參數決定執行什么操作。這個參數的取值有兩種:true和false,true表示異步執行,調用blk_run_queue_async異步地將IO請求下發給驅動程序;false表示同步執行,調用__blk_run_queue立即將調度隊列中的IO請求下發給驅動程序。上文我們提到的blk_finish_plug最終在調用該函數時傳遞的參數就是false,即立即將IO請求下發給驅動程序進行響應,以及在新的request添加到Plug List的過程中發現Plug List已滿,也會以同步的方式unplug,本文不再詳細講述,感興趣的讀者可以去查閱相關源碼。但是這種方法有一個弊端就是需要等待IO傳輸完成,我們都知道IO操作是比較耗時的,因此可能會造成進程的阻塞。異步操作就不會有這種問題,因此Linux內核中也提供了異步unplug的機制。
異步unplug是由工作隊列實現的,在Linux初始化Block子系統時會為每個CPU都創建一個kblockd的工作隊列,該隊列的主要功能由blk_delay_work函數實現,並由定時器定時觸發該工作隊列:
/*
 * blk_delay_work - work handler that runs a request queue.
 * @work: the work member of the owning queue's delay_work
 *
 * Recovers the request_queue embedding @work and calls __blk_run_queue()
 * on it with queue_lock held via spin_lock_irq()/spin_unlock_irq().
 */
static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *queue =
		container_of(work, struct request_queue, delay_work.work);

	spin_lock_irq(queue->queue_lock);
	__blk_run_queue(queue);
	spin_unlock_irq(queue->queue_lock);
}
blk_delay_work最終也是調用了__blk_run_queue下發IO請求。那什么時候會進行異步unplug呢?在進程切換時調用schedule函數,該函數會調用blk_schedule_flush_plug,blk_schedule_flush_plug在調用blk_flush_plug_list時傳遞的就是true,表示異步unplug。
/*
 * blk_schedule_flush_plug - flush a task's plug, if it has one.
 * @tsk: task whose plug is flushed
 *
 * Calls blk_flush_plug_list() with from_schedule == true. Does nothing
 * when @tsk has no plug installed.
 */
static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
	struct blk_plug *pending = tsk->plug;

	if (!pending)
		return;

	blk_flush_plug_list(pending, true);
}
我們來總結一下Linux內核在何時進行蓄流與泄流。
- 蓄流
- 調用blk_start_plug初始化Plug List
- 泄流
- 調用blk_finish_plug同步unplug,將request下發到驅動程序
- 新的request添加到Plug List中發現Plug List已滿,調用blk_flush_plug_list同步unplug
- 進程切換時schedule調用blk_flush_plug_list異步unplug,最終定時器定時觸發kblockd將request下發給驅動程序
調度器
從圖1中可以看到,不管IO請求經過哪條路徑,最后都會匯聚到調度器,由調度器統一下發給驅動程序進行處理,因此我們需要來探討一下IO調度器的工作原理。
Linux中提供了多種調度器策略供用戶選擇,用戶可以通過/sys接口動態調整調度器策略。Linux中主要的調度器策略有如下幾種:
- Noop算法:IO調度器最簡單的算法,將IO請求放入隊列中並順序的執行這些IO請求,對於連續的IO請求也會做相應的合並。
- Deadline算法:保證IO請求在一定時間內能夠被服務,避免某個請求飢餓
- Anticipatory算法:核心是局部性原理,它期望一個進程做完一次IO請求后還會繼續在此處做IO請求
- CFQ算法:即完全公平隊列(Completely Fair Queuing)算法,試圖為競爭塊設備使用權的所有進程分配一個請求隊列和一個時間片,在調度器分配給進程的時間片內,進程可以將其讀寫請求發送給底層塊設備,當進程的時間片消耗完,進程的請求隊列將被掛起,等待調度。
Linux通過將調度器抽象化提出了一個統一的規范接口,各個調度器算法通過該接口向調度器注冊,這樣便有利於調度器策略的擴展和切換。我們首先來了解一下調度器規范接口。Linux中調度器的接口由struct elevator_ops結構定義:
/*
 * struct elevator_ops - table of callbacks an IO scheduler provides.
 *
 * A scheduler fills in the entries it implements; iosched_deadline, for
 * example, sets nine of them through its .ops initializer.
 * NOTE(review): the role of each hook is inferred from its name only --
 * confirm against the elevator core before relying on it.
 */
struct elevator_ops
{
/* merge-related hooks */
elevator_merge_fn *elevator_merge_fn;
elevator_merged_fn *elevator_merged_fn;
elevator_merge_req_fn *elevator_merge_req_fn;
elevator_allow_merge_fn *elevator_allow_merge_fn;
elevator_bio_merged_fn *elevator_bio_merged_fn;
/* dispatching and adding requests */
elevator_dispatch_fn *elevator_dispatch_fn;
elevator_add_req_fn *elevator_add_req_fn;
elevator_activate_req_fn *elevator_activate_req_fn;
elevator_deactivate_req_fn *elevator_deactivate_req_fn;
elevator_completed_req_fn *elevator_completed_req_fn;
/* both neighbour lookups share the elevator_request_list_fn type */
elevator_request_list_fn *elevator_former_req_fn;
elevator_request_list_fn *elevator_latter_req_fn;
elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
elevator_set_req_fn *elevator_set_req_fn;
elevator_put_req_fn *elevator_put_req_fn;
elevator_may_queue_fn *elevator_may_queue_fn;
/* scheduler init / exit */
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
};
elevator_ops結構定義了將IO請求添加到調度隊列以及從調度隊列中將IO請求下發給驅動的函數和相關數據結構。例如DeadLine調度算法定義了如下接口供調度器使用:
/*
 * Elevator descriptor of the "deadline" IO scheduler: its name, owning
 * module, attribute table and the elevator_ops callbacks it implements.
 * Callbacks not named here stay zero-initialized (NULL).
 */
static struct elevator_type iosched_deadline = {
	.elevator_name	= "deadline",
	.elevator_owner	= THIS_MODULE,
	.elevator_attrs	= deadline_attrs,
	.ops = {
		/* init / exit */
		.elevator_init_fn	= deadline_init_queue,
		.elevator_exit_fn	= deadline_exit_queue,

		/* adding and merging requests */
		.elevator_add_req_fn	= deadline_add_request,
		.elevator_merge_fn	= deadline_merge,
		.elevator_merged_fn	= deadline_merged_request,
		.elevator_merge_req_fn	= deadline_merged_requests,

		/* neighbour lookup uses the generic elv_rb_* helpers */
		.elevator_former_req_fn	= elv_rb_former_request,
		.elevator_latter_req_fn	= elv_rb_latter_request,

		/* dispatch */
		.elevator_dispatch_fn	= deadline_dispatch_requests,
	},
};
名字中帶有merge字段的方法是將IO請求通過不同的形式添加到調度器的調度隊列中,具體實現本文不做詳細講述,我們主要來分析一下IO請求是如何從調度隊列中下發給驅動的。以DeadLine算法為例,根據調度算法的策略計算得到當前最優的IO請求,並調用deadline_dispatch_requests函數將該請求調出調度隊列。該函數經過層層調用,最終會調用elv_dispatch_add_tail將IO請求調度出隊列:
/*
 * elv_dispatch_add_tail - append a request to the tail of a queue's queue_head.
 * @q: queue the request is dispatched on
 * @rq: request being dispatched
 *
 * Before linking @rq onto q->queue_head the queue's bookkeeping is updated:
 * last_merge is cleared if it points at @rq, @rq is removed via
 * elv_rqhash_del(), nr_sorted drops by one, and end_sector / boundary_rq
 * are set from @rq.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	/* Clear last_merge when it refers to the request being dispatched. */
	if (rq == q->last_merge)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);
	q->nr_sorted -= 1;

	q->boundary_rq = rq;
	q->end_sector = rq_end_sector(rq);

	list_add_tail(&rq->queuelist, &q->queue_head);
}
我們可以從上面的代碼中看到,該函數更新調度隊列的信息后,將IO請求rq通過其queuelist字段鏈入了塊設備請求隊列q的queue_head鏈表尾部。在稍后的時間里內核會進行unplug操作,最終驅動程序會從請求隊列的queue_head鏈表中讀取IO請求進行響應。
