The Linux kernel block layer has three schedulers designed for the multi-queue (blk-mq) framework: kyber, mq-deadline and bfq. This article explains the principle of the mq-deadline scheduler and walks through its source code; the kernel version analyzed is 4.20.
Principle
The mq-deadline scheduler is derived from the legacy deadline scheduler and adapted to the block layer's multi-queue framework. The basic principle and most of the code are the same, so anyone familiar with the deadline scheduler will find mq-deadline easy to follow.
mq-deadline splits IO into two types, read and write. Each type has one red-black tree and one FIFO list: the red-black tree orders requests by the LBA they access, which makes lookup and merging easy, while the FIFO list records the order in which requests entered mq-deadline and is used to enforce their expiry deadline.
Reads may take dispatch opportunities away from writes, but not forever: a counter guarantees that reads cannot starve writes indefinitely.
mq-deadline prefers to dispatch requests in batches without looking at expiry times; only once a batch reaches a certain size does it check the deadlines and dispatch the requests that are about to expire.
Finally, for passthrough IO that must reach the device as quickly as possible, mq-deadline keeps a separate dispatch list, and requests on that list are always dispatched first.
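These knobs are visible from user space. The following sketch is a minimal user-space example (it assumes a device named sda, so adjust the path for your system) that switches the device to mq-deadline and prints the tunables that map onto the fields discussed below:
#include <stdio.h>

// print one mq-deadline tunable from sysfs
static void dump(const char *attr)
{
    char path[256], buf[64];
    snprintf(path, sizeof(path), "/sys/block/sda/queue/iosched/%s", attr);
    FILE *f = fopen(path, "r");
    if (f && fgets(buf, sizeof(buf), f))
        printf("%-14s = %s", attr, buf);
    if (f)
        fclose(f);
}

int main(void)
{
    // select the scheduler (needs root)
    FILE *f = fopen("/sys/block/sda/queue/scheduler", "w");
    if (f) {
        fputs("mq-deadline", f);
        fclose(f);
    }
    // these attribute names come from deadline_attrs, referenced in the registration below
    const char *attrs[] = { "read_expire", "write_expire", "fifo_batch",
                            "writes_starved", "front_merges" };
    for (unsigned i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
        dump(attrs[i]);
    return 0;
}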

Data structures
Each block device has exactly one deadline_data; all of the queues described above live inside it.
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
// the array index is the data direction: read (0) or write (1)
struct rb_root sort_list[2]; // requests ordered by LBA on the red-black tree
struct list_head fifo_list[2]; // requests ordered by their arrival in mq-deadline
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2]; // next request to dispatch while batching
unsigned int batching; // number of requests dispatched in the current batch; once it hits the limit, expiry times are checked
unsigned int starved; // how many times reads have starved writes; must not exceed writes_starved
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2]; // expiry delay for reads and writes (in jiffies)
int fifo_batch; // maximum number of requests dispatched in one batch, default 16
int writes_starved; // maximum number of times reads may starve writes, default 2
int front_merges; // whether front merges are allowed, used when a bio is merged into mq-deadline
spinlock_t lock;
spinlock_t zone_lock; // protects zone write locking on zoned block devices
struct list_head dispatch; // list dispatched with priority, holds passthrough IO
};
Initialization
Registering the scheduler at init
When the mq-deadline module is initialized it registers the scheduler, including its name, alias, owning module, attributes and, most importantly, its set of operations.
static int __init deadline_init(void)
{
return elv_register(&mq_deadline);
}
The registered scheduler looks like this:
static struct elevator_type mq_deadline = {
.ops.mq = {
.insert_requests = dd_insert_requests, // insert requests into the scheduler
.dispatch_request = dd_dispatch_request, // hand out a request to dispatch
.prepare_request = dd_prepare_request,
.finish_request = dd_finish_request, // called when a request completes
.next_request = elv_rb_latter_request, // find the request after the given one
.former_request = elv_rb_former_request, // find the request before the given one
.bio_merge = dd_bio_merge, // called when a bio is to be merged into mq-deadline
.request_merge = dd_request_merge, // find a request the bio can be merged into
.requests_merged = dd_merged_requests, // called after two requests have been merged
.request_merged = dd_request_merged, // called after a bio has been merged into a request
.has_work = dd_has_work,
.init_sched = dd_init_queue, // initialization
.exit_sched = dd_exit_queue,
},
.uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
.elevator_owner = THIS_MODULE,
};
Initializing mq-deadline
When the mq-deadline scheduler is enabled for a queue, its init function dd_init_queue() is called.
static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = dd; // associate the deadline_data with the elevator_queue
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT;
dd->sort_list[WRITE] = RB_ROOT; // initialize the per-direction queues
dd->fifo_expire[READ] = read_expire; // set the read and write expire times
dd->fifo_expire[WRITE] = write_expire;
dd->writes_starved = writes_starved;
dd->front_merges = 1; // enabled by default; adjustable at runtime via the front_merges sysfs attribute in deadline_attrs
dd->fifo_batch = fifo_batch; // upper bound on the number of requests dispatched in one batch
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->dispatch); // initialize the dispatch list
q->elevator = eq; // associate the elevator_queue with the request_queue
return 0;
}
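For reference, the values assigned above come from module-level constants near the top of block/mq-deadline.c (quoted from 4.20, comments slightly shortened):
static const int read_expire = HZ / 2;  /* max time before a read is submitted */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2;    /* max times reads can starve a write */
static const int fifo_batch = 16;       /* # of sequential requests treated as one */
So by default reads get a 500 ms deadline and writes a 5 s one, and all of these values can be changed at runtime through the sysfs attributes shown earlier.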
Merging a bio into mq-deadline
A bio arriving at the block layer first tries to merge with an existing request; only if that fails is a new request allocated. The hook used for the merge attempt is bio_merge, which for mq-deadline is dd_bio_merge().
static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
struct request *free = NULL;
bool ret;
spin_lock(&dd->lock);
// Call the generic merge helper (its internals are outside the scope of this article).
// It invokes the following hooks in turn:
// request_merge - find a request that the bio can be merged into
// requests_merged - called after two adjacent requests have been merged
// request_merged - called after a request has successfully absorbed a bio or an adjacent request
ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock(&dd->lock);
if (free)
blk_mq_free_request(free);
return ret;
}
The dd_request_merge() function:
// find a request that the bio can be merged into; it is returned through the rq parameter
static int dd_request_merge(struct request_queue *q, struct request **rq,
struct bio *bio)
{
struct deadline_data *dd = q->elevator->elevator_data;
sector_t sector = bio_end_sector(bio);
struct request *__rq;
// front_merges defaults to 1
if (!dd->front_merges)
return ELEVATOR_NO_MERGE;
// Look in the rbtree for a request whose starting LBA equals the bio's ending LBA,
// which means the bio can be front-merged into that request.
// bio_data_dir() tells whether the bio is a read or a write.
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
if (__rq) {
BUG_ON(sector != blk_rq_pos(__rq));
// check whether the merge is allowed
if (elv_bio_merge_ok(__rq, bio)) {
*rq = __rq;
return ELEVATOR_FRONT_MERGE;
}
}
return ELEVATOR_NO_MERGE;
}
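To see why comparing bio_end_sector(bio) with a request's starting sector identifies a front merge, here is a small stand-alone model (plain user-space C, not kernel code; the sector numbers are made up):
#include <stdio.h>

// Toy model of a bio / request as a [start, start+len) sector range.
struct range { unsigned long start, len; };

int main(void)
{
    struct range rq  = { 108, 8 };   // existing request: sectors 108..115
    struct range bio = { 100, 8 };   // new bio: sectors 100..107

    unsigned long bio_end = bio.start + bio.len;  // what bio_end_sector(bio) returns

    if (bio_end == rq.start)
        // the bio ends exactly where the request begins, so it can be
        // glued on in front of the request (ELEVATOR_FRONT_MERGE)
        printf("front merge: new range %lu..%lu\n",
               bio.start, rq.start + rq.len - 1);
    else if (rq.start + rq.len == bio.start)
        // the symmetric case, handled by the generic back-merge path
        printf("back merge\n");
    else
        printf("no merge\n");
    return 0;
}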
After a bio has been successfully merged into a request, the request covers a larger range, so the elevator also looks at the request just before or after it and tries to merge those two requests. How does it find the previous or the next request of the current one?
Schedulers may register the next_request and former_request hooks for exactly this purpose. For mq-deadline they are elv_rb_latter_request() and elv_rb_former_request(), two generic elevator helpers: since the red-black tree is ordered by LBA, finding the request before or after the current one is trivial.
struct request *elv_rb_latter_request(struct request_queue *q,
struct request *rq)
{
struct rb_node *rbnext = rb_next(&rq->rb_node);
if (rbnext)
return rb_entry_rq(rbnext);
return NULL;
}
struct request *elv_rb_former_request(struct request_queue *q,
struct request *rq)
{
struct rb_node *rbprev = rb_prev(&rq->rb_node);
if (rbprev)
return rb_entry_rq(rbprev);
return NULL;
}
The dd_merged_requests() function:
// notify the scheduler that two requests have been merged
static void dd_merged_requests(struct request_queue *q, struct request *req,
struct request *next)
{
/*
* if next expires before rq, assign its expire time to rq
* and move into next position (next will be deleted) in fifo
*/
if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
if (time_before((unsigned long)next->fifo_time,
(unsigned long)req->fifo_time)) {
// next is the request being absorbed.
// If next expires before req, req inherits next's expire time so that next's deadline is still honored,
// and req takes next's position in the fifo list.
list_move(&req->queuelist, &next->queuelist);
req->fifo_time = next->fifo_time;
}
}
/*
* kill knowledge of next, this one is a goner
*/
// remove next from mq-deadline, since it has been merged into req
deadline_remove_request(q, next);
}
Removing a request from mq-deadline:
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
const int data_dir = rq_data_dir(rq);
// if the request being removed is the batch's next request, pick a new next request to batch
if (dd->next_rq[data_dir] == rq)
dd->next_rq[data_dir] = deadline_latter_request(rq);
// remove it from the rbtree
elv_rb_del(deadline_rb_root(dd, rq), rq);
}
/*
* remove rq from rbtree and fifo.
*/
static void deadline_remove_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
// remove it from the fifo list
list_del_init(&rq->queuelist);
/*
* We might not be on the rbtree, if we are doing an insert merge
*/
if (!RB_EMPTY_NODE(&rq->rb_node))
deadline_del_rq_rb(dd, rq); // remove it from the rbtree
elv_rqhash_del(q, rq);
if (q->last_merge == rq)
q->last_merge = NULL;
}
The deadline_latter_request() function:
static inline struct request *
deadline_latter_request(struct request *rq)
{
// the rbtree is ordered by LBA, so the next request in LBA order is easy to find
struct rb_node *node = rb_next(&rq->rb_node);
if (node)
return rb_entry_rq(node);
return NULL;
}
Finally the dd_request_merged() function is called:
static void dd_request_merged(struct request_queue *q, struct request *req,
enum elv_merge type)
{
struct deadline_data *dd = q->elevator->elevator_data;
/*
* if the merge was a front merge, we need to reposition request
*/
// The request has just grown through a merge (a bio was merged into it, it absorbed a neighboring request, or both). After a front merge its starting LBA has changed, so its position in the rbtree must change too.
if (type == ELEVATOR_FRONT_MERGE) {
// take the request off the tree first
elv_rb_del(deadline_rb_root(dd, req), req);
// then re-insert it at its new position
deadline_add_rq_rb(dd, req);
}
}
The deadline_add_rq_rb() function:
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
struct rb_root *root = deadline_rb_root(dd, rq);
elv_rb_add(root, rq);
}
Inserting a request into mq-deadline
When a bio cannot be merged into any existing request, a new request is allocated and inserted into mq-deadline to wait for scheduling.
The dd_insert_requests() function:
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list, bool at_head)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
spin_lock(&dd->lock);
while (!list_empty(list)) {
struct request *rq;
// take the request off the incoming list
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
// and insert it into mq-deadline
dd_insert_request(hctx, rq, at_head);
}
spin_unlock(&dd->lock);
}
The dd_insert_request() function:
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
/*
* This may be a requeue of a write request that has locked its
* target zone. If it is the case, this releases the zone lock.
*/
// if this is a requeued write that holds a zone write lock, release it before inserting (see the short note on zoned devices below)
blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq))
return;
blk_mq_sched_request_inserted(rq);
// passthrough IO (and head insertions) go straight onto the dispatch list
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &dd->dispatch);
else
list_add_tail(&rq->queuelist, &dd->dispatch);
} else {
// normal IO goes into mq-deadline's rbtree
deadline_add_rq_rb(dd, rq);
if (rq_mergeable(rq)) {
elv_rqhash_add(q, rq);
if (!q->last_merge)
q->last_merge = rq;
}
/*
* set expire time and add to fifo list
*/
// set the request's expire time
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
// and also add the request to the fifo list
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}
}
A short digression on zoned block devices: to increase storage density, some devices are divided into zones that can only be written sequentially; random reads are allowed, random writes are not. Writes targeting such a zone must therefore stay in order, while mq-deadline's scheduling would normally reorder IO, so every code path needs special handling for the zoned case.
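The sequential-write rule is easiest to picture with a zone write pointer: a write is only accepted if it starts exactly where the pointer currently is, and the pointer then advances. The toy model below (user-space C, not the kernel's implementation) shows why the order of writes inside a zone must never be broken, which is what the zone write lock and dd->zone_lock protect in the dispatch paths:
#include <stdio.h>
#include <stdbool.h>

// Toy model of one sequential-write-required zone.
struct zone {
    unsigned long start, len;  // zone extent in sectors
    unsigned long wp;          // write pointer: next sector that may be written
};

// A write is accepted only if it begins exactly at the write pointer.
static bool zone_write(struct zone *z, unsigned long sector, unsigned long nr)
{
    if (sector != z->wp || sector + nr > z->start + z->len)
        return false;          // out-of-order or past the zone: rejected
    z->wp += nr;               // sequential write: pointer advances
    return true;
}

int main(void)
{
    struct zone z = { .start = 0, .len = 256, .wp = 0 };
    printf("write @0   -> %s\n", zone_write(&z, 0, 8)  ? "ok" : "rejected");
    printf("write @16  -> %s\n", zone_write(&z, 16, 8) ? "ok" : "rejected"); // gap: rejected
    printf("write @8   -> %s\n", zone_write(&z, 8, 8)  ? "ok" : "rejected");
    return 0;
}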
Dispatching requests from mq-deadline
When a hardware queue run starts, the scheduler's dispatch_request hook is called to obtain a request that can be dispatched; for mq-deadline that is dd_dispatch_request().
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
struct request *rq;
// mq-deadline state is global (per block device) and shared by all hardware queues, so it must be taken under the lock
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
return rq;
}
The __dd_dispatch_request() function:
static struct request *__dd_dispatch_request(struct deadline_data *dd)
{
struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
// if the dispatch list is not empty, the passthrough IO on it is dispatched first
if (!list_empty(&dd->dispatch)) {
rq = list_first_entry(&dd->dispatch, struct request, queuelist);
list_del_init(&rq->queuelist);
goto done;
}
// check whether the read and write fifo lists are non-empty
reads = !list_empty(&dd->fifo_list[READ]);
writes = !list_empty(&dd->fifo_list[WRITE]);
/*
* batches are currently reads XOR writes
*/
// Batching dispatches requests in LBA order. Try the write direction's next batch request first, then fall back to the read direction (only one of the two next_rq pointers is set at a time).
rq = deadline_next_request(dd, WRITE);
if (!rq)
rq = deadline_next_request(dd, READ);
// if the batch has not yet reached its limit, the request can be dispatched right away; otherwise check the fifo lists for expired IO
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request and are still entitled to batch */
goto dispatch_request;
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
// serve reads first, if there are any pending
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
// If there is a dispatchable write waiting and reads have already starved writes writes_starved times or more, go dispatch writes;
// otherwise dispatch reads.
if (deadline_fifo_request(dd, WRITE) &&
(dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
goto dispatch_find_request;
}
/*
* there are either no reads or writes have been starved
*/
// dispatch writes and reset starved to 0, so that reads may preempt writes again
if (writes) {
dispatch_writes:
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
dd->starved = 0;
data_dir = WRITE;
goto dispatch_find_request;
}
return NULL;
// both the read and the write paths end up here
dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
// get the next batch request for the chosen direction
next_rq = deadline_next_request(dd, data_dir);
// if the fifo list has an expired request, or the batch has no next request, take the first (oldest) request from the fifo list
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
rq = deadline_fifo_request(dd, data_dir);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
// otherwise continue the batch
rq = next_rq;
}
/*
* For a zoned block device, if we only have writes queued and none of
* them can be dispatched, rq will be NULL.
*/
if (!rq)
return NULL;
dd->batching = 0;
dispatch_request:
/*
* rq is the selected appropriate request.
*/
dd->batching++;
deadline_move_request(dd, rq); // record the next request in LBA order for batching and remove rq from the rbtree and fifo list
done:
/*
* If the request needs its target zone locked, do it.
*/
// if the request is a write whose target zone must be locked, take the zone write lock
blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
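Stripping away locking and zone handling, the decision order of __dd_dispatch_request() boils down to: dispatch list first, then continue the current batch while it is under fifo_batch, then pick a direction (reads preferred unless writes have been starved writes_starved times), and within that direction take the fifo head if its deadline has passed, otherwise keep walking in LBA order. The sketch below is a simplified user-space model of just that decision (not kernel code; it only says where the next request would come from):
#include <stdbool.h>
#include <stdio.h>

// Toy model of the decision order in __dd_dispatch_request().
struct dd_state {
    bool dispatch_nonempty;   // passthrough requests pending?
    bool next_rq;             // does the current batch have a next request?
    bool reads, writes;       // fifo_list[READ] / fifo_list[WRITE] non-empty?
    bool read_expired, write_expired;  // front of the fifo past its deadline?
    int batching, fifo_batch; // current batch size / fifo_batch limit
    int starved, writes_starved;       // starvation counter / limit
};

static const char *pick(struct dd_state *dd)
{
    if (dd->dispatch_nonempty)
        return "dispatch list (passthrough)";
    if (dd->next_rq && dd->batching < dd->fifo_batch) {
        dd->batching++;
        return "continue batch (LBA order)";
    }
    int dir;  // 0 = read, 1 = write
    if (dd->reads && !(dd->writes && dd->starved++ >= dd->writes_starved))
        dir = 0;
    else if (dd->writes) {
        dd->starved = 0;
        dir = 1;
    } else
        return "nothing to do";
    dd->batching = 1;  // a new batch starts
    bool expired = dir ? dd->write_expired : dd->read_expired;
    // expired request (or no batch continuation): take the fifo head,
    // otherwise keep walking the rbtree in LBA order
    return expired || !dd->next_rq ? (dir ? "write fifo head" : "read fifo head")
                                   : (dir ? "write, LBA order" : "read, LBA order");
}

int main(void)
{
    struct dd_state dd = { .reads = true, .writes = true, .read_expired = true,
                           .fifo_batch = 16, .writes_starved = 2 };
    printf("%s\n", pick(&dd));  // prints "read fifo head"
    return 0;
}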
The deadline_next_request() function:
// find the next request to dispatch in the current batch
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
unsigned long flags;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
// the next request of the batch has already been recorded here
rq = dd->next_rq[data_dir];
if (!rq)
return NULL;
// a read, or a write to a non-zoned device, can be dispatched directly
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
// For a write to a zoned device, the target zone must not be locked by another in-flight write;
// walk forward in LBA order until a write whose target zone is unlocked is found,
// because writes within a zone must stay sequential and their order must not be broken.
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
rq = deadline_latter_request(rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
The deadline_check_fifo() function:
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
// check whether the first request on the fifo list has expired
struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
/*
* rq is expired!
*/
if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
return 1;
return 0;
}
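deadline_check_fifo() relies on the kernel's jiffies comparison helper, which stays correct even if the jiffies counter wraps around, because the comparison is done through a signed subtraction. A minimal user-space illustration (the real macro additionally type-checks its arguments as unsigned long):
#include <stdio.h>

// Simplified version of the kernel's time_after_eq(): true if a is at or
// after b, even if the unsigned counter wrapped between b and a.
#define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

int main(void)
{
    unsigned long fifo_time = (unsigned long)-10; // deadline set just before wrap
    unsigned long jiffies   = 5;                  // counter has wrapped past it
    printf("expired: %d\n", time_after_eq(jiffies, fifo_time)); // prints 1
    return 0;
}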
The deadline_fifo_request() function:
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
unsigned long flags;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
// same zoned-device reasoning as in deadline_next_request()
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
if (blk_req_can_dispatch_to_zone(rq))
goto out;
}
rq = NULL;
out:
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
Request completion
Called when a request finishes:
static void dd_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
// when a write to a zoned device finishes, release the zone write lock if one was taken
if (blk_queue_is_zoned(q)) {
struct deadline_data *dd = q->elevator->elevator_data;
unsigned long flags;
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
mq-deadline exit
static void dd_exit_queue(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
// free the memory allocated at init time
kfree(dd);
}
Summary
mq-deadline is a good fit for latency-sensitive workloads: it guarantees that IO is issued within a bounded time, so as long as the underlying device behaves, requests are completed within their deadline.
