NVMe Driver Explained [repost]


NVMe Driver Explained, Part 1
http://blog.csdn.net/qqqqqq999999/article/details/47732319

First, open the Kconfig file under drivers/block, which defines the BLK_DEV_NVME config option:

config BLK_DEV_NVME
    tristate "NVM Express block device"
    depends on PCI
    ---help---
      The NVM Express driver is for solid state drives directly
      connected to the PCI or PCI Express bus.  If you know you
      don't have one of these, it is safe to answer N.

      To compile this driver as a module, choose M here: the
      module will be called nvme.

From the console, run make menuconfig and search for BLK_DEV_NVME; you get the following dependency information.

Symbol: BLK_DEV_NVME [=m]
| Type  : tristate
| Prompt: NVM Express block device
| Location:
|   -> Device Drivers
| (1)  -> Block devices (BLK_DEV [=y])
| Defined at drivers/block/Kconfig:313
| Depends on: BLK_DEV [=y] && PCI [=y]

As you can see, nvme depends on BLK_DEV and PCI.

Open drivers/block/Makefile and search for NVME; you will find:

obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
nvme-y := nvme-core.o nvme-scsi.o

As for the files related to the block layer, open block/Makefile:

obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
            blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
            blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
            blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
            blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
            genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
            partitions/

Wow, that's a lot, isn't it? Don't worry: NVMe only uses a handful of block-layer functions, so there is no need to read every block-related file unless you have the energy to study them all.

Good. At this point we know which files to read: nvme-core.c and nvme-scsi.c are essential; the rest we will study only as our driver calls into those block-layer functions.

Open nvme-core.c and find the entry point, module_init(nvme_init);

static int __init nvme_init(void)
{
    int result;

    init_waitqueue_head(&nvme_kthread_wait);    /* create the wait queue */

    nvme_workq = create_singlethread_workqueue("nvme");    /* create the workqueue */
    if (!nvme_workq)
        return -ENOMEM;

    result = register_blkdev(nvme_major, "nvme");    /* register the block device */
    if (result < 0)
        goto kill_workq;
    else if (result > 0)
        nvme_major = result;

    result = pci_register_driver(&nvme_driver);    /* register the PCI driver */
    if (result)
        goto unregister_blkdev;
    return 0;

 unregister_blkdev:
    unregister_blkdev(nvme_major, "nvme");
 kill_workq:
    destroy_workqueue(nvme_workq);
    return result;
}

After the PCI driver is registered, the probe function in nvme_driver gets called. The beginning is always rosy: look how concise the function is. Don't celebrate too early, though; the painful part is closing in.

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
    int node, result = -ENOMEM;
    struct nvme_dev *dev;

    node = dev_to_node(&pdev->dev);    /* get the device's NUMA node */
    if (node == NUMA_NO_NODE)
        set_dev_node(&pdev->dev, 0);

    dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
    if (!dev)
        return -ENOMEM;
    dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),    /* allocate the MSI-X entries */
                            GFP_KERNEL, node);
    if (!dev->entry)
        goto free;
    dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),    /* allocate the queue array; */
                            GFP_KERNEL, node);    /* the extra 1 is for the admin queue */
    if (!dev->queues)
        goto free;

    INIT_LIST_HEAD(&dev->namespaces);    /* initialize the namespaces list */
    dev->reset_workfn = nvme_reset_failed_dev;
    INIT_WORK(&dev->reset_work, nvme_reset_workfn);
    dev->pci_dev = pci_dev_get(pdev);
    pci_set_drvdata(pdev, dev);
    result = nvme_set_instance(dev);    /* assign the instance id that identifies this device */
    if (result)
        goto put_pci;

    result = nvme_setup_prp_pools(dev);    /* set up the PRP memory pools needed for DMA */
    if (result)
        goto release;

    kref_init(&dev->kref);
    result = nvme_dev_start(dev);    /* create the admin queue and I/O queues, request the IRQ */
    if (result)
        goto release_pools;

    if (dev->online_queues > 1)
        result = nvme_dev_add(dev);    /* initialize blk-mq and add a usable nvme dev; the admin queue can now send commands */
    if (result)
        goto shutdown;

    scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
    dev->miscdev.minor = MISC_DYNAMIC_MINOR;
    dev->miscdev.parent = &pdev->dev;
    dev->miscdev.name = dev->name;
    dev->miscdev.fops = &nvme_dev_fops;
    result = misc_register(&dev->miscdev);    /* register a misc device */
    if (result)
        goto remove;

    nvme_set_irq_hints(dev);

    dev->initialized = 1;
    return 0;

 remove:
    nvme_dev_remove(dev);
    nvme_dev_remove_admin(dev);
    nvme_free_namespaces(dev);
 shutdown:
    nvme_dev_shutdown(dev);
 release_pools:
    nvme_free_queues(dev, 0);
    nvme_release_prp_pools(dev);
 release:
    nvme_release_instance(dev);
 put_pci:
    pci_dev_put(dev->pci_dev);
 free:
    kfree(dev->queues);
    kfree(dev->entry);
    kfree(dev);
    return result;
}

Each of the main functions above has been briefly annotated with what it does. Now let's look at how those functions are implemented.

static int nvme_set_instance(struct nvme_dev *dev)
{
    int instance, error;

    do {
        if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
            return -ENODEV;

        spin_lock(&dev_list_lock);
        error = ida_get_new(&nvme_instance_ida, &instance);
        spin_unlock(&dev_list_lock);
    } while (error == -EAGAIN);

    if (error)
        return -ENODEV;

    dev->instance = instance;    /* this is the device's instance, effectively its id */
    return 0;
}
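
A side note: the ida_pre_get()/ida_get_new() retry loop is the classic IDA idiom of this kernel era. On later kernels the same id allocation is usually written with ida_simple_get(); a rough equivalent would look like the sketch below (illustrative only, not this driver's code):

static int nvme_set_instance_modern(struct nvme_dev *dev)    /* hypothetical name */
{
    int instance;

    /* allocate the lowest free id >= 0; returns a negative errno on failure */
    instance = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
    if (instance < 0)
        return -ENODEV;

    dev->instance = instance;
    return 0;
}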

nvme_setup_prp_pools creates the memory pools used for DMA; allocations from prp_page_pool come back as kernel virtual addresses paired with matching DMA addresses.

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
    struct device *dmadev = &dev->pci_dev->dev;
    dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
                        PAGE_SIZE, PAGE_SIZE, 0);
    if (!dev->prp_page_pool)
        return -ENOMEM;

    /* Optimisation for I/Os between 4k and 128k */
    dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
                        256, 256, 0);
    if (!dev->prp_small_pool) {
        dma_pool_destroy(dev->prp_page_pool);
        return -ENOMEM;
    }
    return 0;
}
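
For context, a PRP (Physical Region Page) list is the array of page addresses an NVMe command uses to describe a data buffer spanning more than two pages. A minimal sketch of how such a pool is consumed follows (hedged illustration; the real logic lives in nvme_setup_prps(), and first_dma/npages here are hypothetical inputs):

static int fill_prp_list(struct nvme_dev *dev, dma_addr_t first_dma,
             int npages, dma_addr_t *prp_dma)
{
    __le64 *prp_list;
    int i;

    /* grab one PRP list page from the pool for a large I/O */
    prp_list = dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, prp_dma);
    if (!prp_list)
        return -ENOMEM;

    for (i = 0; i < npages; i++)    /* one 8-byte entry per data page */
        prp_list[i] = cpu_to_le64(first_dma + i * PAGE_SIZE);

    /* the command's PRP2 field would then be set to *prp_dma; on
     * completion: dma_pool_free(dev->prp_page_pool, prp_list, *prp_dma); */
    return 0;
}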

Below is one of the heavyweight functions, nvme_dev_start:

static int nvme_dev_start(struct nvme_dev *dev)
{
    int result;
    bool start_thread = false;

    result = nvme_dev_map(dev);
    if (result)
        return result;

    result = nvme_configure_admin_queue(dev);    /* configure the admin submission and completion queues, 64 entries deep */
    if (result)
        goto unmap;

    spin_lock(&dev_list_lock);
    if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
        start_thread = true;
        nvme_thread = NULL;
    }
    list_add(&dev->node, &dev_list);
    spin_unlock(&dev_list_lock);

    if (start_thread) {
        nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
        wake_up_all(&nvme_kthread_wait);
    } else
        wait_event_killable(nvme_kthread_wait, nvme_thread);

    if (IS_ERR_OR_NULL(nvme_thread)) {
        result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
        goto disable;
    }

    nvme_init_queue(dev->queues[0], 0);    /* initialize the queue and bump online_queues */
    result = nvme_alloc_admin_tags(dev);
    if (result)
        goto disable;

    result = nvme_setup_io_queues(dev);
    if (result)
        goto free_tags;

    nvme_set_irq_hints(dev);

    return result;

 free_tags:
    nvme_dev_remove_admin(dev);
 disable:
    nvme_disable_queue(dev, 0);
    nvme_dev_list_remove(dev);
 unmap:
    nvme_dev_unmap(dev);
    return result;
}

First, look at nvme_configure_admin_queue(dev).

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
    int result;
    u32 aqa;
    u64 cap = readq(&dev->bar->cap);    /* read the CAP register */
    struct nvme_queue *nvmeq;
    unsigned page_shift = PAGE_SHIFT;
    unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
    unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;

    if (page_shift < dev_page_min) {
        dev_err(&dev->pci_dev->dev,
                "Minimum device page size (%u) too large for "
                "host (%u)\n", 1 << dev_page_min,
                1 << page_shift);
        return -ENODEV;
    }
    if (page_shift > dev_page_max) {
        dev_info(&dev->pci_dev->dev,
                "Device maximum page size (%u) smaller than "
                "host (%u); enabling work-around\n",
                1 << dev_page_max, 1 << page_shift);
        page_shift = dev_page_max;
    }

    result = nvme_disable_ctrl(dev, cap);    /* disable the controller */
    if (result < 0)
        return result;

    nvmeq = dev->queues[0];
    if (!nvmeq) {
        nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);    /* if nvmeq is NULL, create it */
        if (!nvmeq)
            return -ENOMEM;
    }

    aqa = nvmeq->q_depth - 1;
    aqa |= aqa << 16;

    dev->page_size = 1 << page_shift;

    dev->ctrl_config = NVME_CC_CSS_NVM;
    dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
    dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
    dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

    writel(aqa, &dev->bar->aqa);
    writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
    writeq(nvmeq->cq_dma_addr, &dev->bar->acq);    /* these writes tell the controller where the admin SQ and CQ live */

    result = nvme_enable_ctrl(dev, cap);
    if (result)
        goto free_nvmeq;

    nvmeq->cq_vector = 0;
    result = queue_request_irq(dev, nvmeq, nvmeq->irqname);    /* register the interrupt handler */
    if (result)
        goto free_nvmeq;

    return result;

 free_nvmeq:
    nvme_free_queues(dev, 0);
    return result;
}
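
To make the aqa computation above concrete, here is a worked example, based on the AQA register layout in the NVMe spec (ASQS in bits 11:0, ACQS in bits 27:16, both zero-based):

/* Worked example for the AQA write, assuming NVME_AQ_DEPTH = 64:
 *   aqa  = 64 - 1      = 0x0000003F    (queue sizes are zero-based)
 *   aqa |= aqa << 16  -> 0x003F003F
 * so ASQS (bits 11:0) and ACQS (bits 27:16) are both 63, i.e. a
 * 64-entry admin submission queue and a 64-entry completion queue. */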

Now let's see what the nvme_alloc_queue function does.

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
                            int depth)
{
    struct device *dmadev = &dev->pci_dev->dev;
    struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
    if (!nvmeq)
        return NULL;

    nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
                    &nvmeq->cq_dma_addr, GFP_KERNEL);    /* allocate space for depth completion queue entries */
    if (!nvmeq->cqes)
        goto free_nvmeq;

    nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
                    &nvmeq->sq_dma_addr, GFP_KERNEL);    /* allocate space for depth submission queue commands */
    if (!nvmeq->sq_cmds)
        goto free_cqdma;

    nvmeq->q_dmadev = dmadev;
    nvmeq->dev = dev;
    snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
            dev->instance, qid);    /* set the queue's irqname */
    spin_lock_init(&nvmeq->q_lock);
    nvmeq->cq_head = 0;
    nvmeq->cq_phase = 1;
    nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
    nvmeq->q_depth = depth;
    nvmeq->qid = qid;
    dev->queue_count++;
    dev->queues[qid] = nvmeq;    /* store the new queue at dev->queues[qid] */

    return nvmeq;    /* return the queue just built */

 free_cqdma:
    dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
                            nvmeq->cq_dma_addr);
 free_nvmeq:
    kfree(nvmeq);
    return NULL;
}
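
The q_db assignment above deserves a note: the controller exposes one submission-tail and one completion-head doorbell per queue, spaced by the stride derived from CAP.DSTRD. Assuming db_stride == 1 (the common case), the layout is:

/* Doorbell layout sketch, assuming dev->db_stride == 1 (CAP.DSTRD == 0):
 *   dev->dbs[0]  SQ 0 tail doorbell (admin queue)
 *   dev->dbs[1]  CQ 0 head doorbell (admin queue)
 *   dev->dbs[2]  SQ 1 tail doorbell (first I/O queue)
 *   dev->dbs[3]  CQ 1 head doorbell (first I/O queue)
 *   ...
 * hence queue qid's doorbell pair starts at dbs[qid * 2 * db_stride]. */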

At this point, the admin queue's completion and submission queues have been created and its interrupt registered. The next statement back in nvme_dev_start creates the nvme_kthread daemon, which we will discuss later. First, let's look at the following function.

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
    struct nvme_dev *dev = nvmeq->dev;

    spin_lock_irq(&nvmeq->q_lock);
    nvmeq->sq_tail = 0;    /* finish initializing the queue state */
    nvmeq->cq_head = 0;
    nvmeq->cq_phase = 1;
    nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
    memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
    dev->online_queues++;    /* one more queue is now online */
    spin_unlock_irq(&nvmeq->q_lock);
}

The following function is the core of how NVMe uses blk-mq.

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
    if (!dev->admin_q) {    /* admin_q starts out NULL, so we take this branch */
        dev->admin_tagset.ops = &nvme_mq_admin_ops;    /* fill in the blk_mq_tag_set; nvme_mq_admin_ops is used when requests run */
        dev->admin_tagset.nr_hw_queues = 1;    /* one hardware queue */
        dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
        dev->admin_tagset.timeout = ADMIN_TIMEOUT;
        dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
        dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
        dev->admin_tagset.driver_data = dev;

        if (blk_mq_alloc_tag_set(&dev->admin_tagset))    /* allocate a tag set to associate with one or more request queues */
            return -ENOMEM;

        dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);    /* initialize the request_queue */
        if (IS_ERR(dev->admin_q)) {
            blk_mq_free_tag_set(&dev->admin_tagset);
            return -ENOMEM;
        }
        if (!blk_get_queue(dev->admin_q)) {
            nvme_dev_remove_admin(dev);
            return -ENODEV;
        }
    } else
        blk_mq_unfreeze_queue(dev->admin_q);

    return 0;
}
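
For reference, the ops table wired in above looks roughly like this in kernels of this vintage (field names from struct blk_mq_ops; treat this as a sketch from memory rather than an exact copy of the driver):

static struct blk_mq_ops nvme_mq_admin_ops = {
    .queue_rq     = nvme_queue_rq,          /* build and ring an NVMe command for a request */
    .map_queue    = blk_mq_map_queue,       /* default cpu -> hw queue mapping */
    .init_hctx    = nvme_admin_init_hctx,
    .exit_hctx    = nvme_exit_hctx,
    .init_request = nvme_admin_init_request, /* called per request by blk_mq_init_rq_map() below */
    .timeout      = nvme_timeout,
};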

Next, let's go through the related blk-mq functions one by one. (Figures omitted from this repost: a diagram of the blk-mq scheduling model, and one summarizing the work done by blk_mq_alloc_tag_set(&dev->admin_tagset).)

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it's too large. In that case, the set
 * value will be stored in set->queue_depth.
 */

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
    BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

    if (!set->nr_hw_queues)
        return -EINVAL;
    if (!set->queue_depth)
        return -EINVAL;
    if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
        return -EINVAL;

    if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
        return -EINVAL;

    if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
        pr_info("blk-mq: reduced tag depth to %u\n",
            BLK_MQ_MAX_DEPTH);
        set->queue_depth = BLK_MQ_MAX_DEPTH;
    }

    /*
     * If a crashdump is active, then we are potentially in a very
     * memory constrained environment. Limit us to 1 queue and
     * 64 tags to prevent using too much memory.
     */
    if (is_kdump_kernel()) {
        set->nr_hw_queues = 1;
        set->queue_depth = min(64U, set->queue_depth);
    }

    set->tags = kmalloc_node(set->nr_hw_queues *    /* allocate one tags pointer per hardware queue */
                 sizeof(struct blk_mq_tags *),
                 GFP_KERNEL, set->numa_node);
    if (!set->tags)
        return -ENOMEM;

    if (blk_mq_alloc_rq_maps(set))
        goto enomem;

    mutex_init(&set->tag_list_lock);
    INIT_LIST_HEAD(&set->tag_list);

    return 0;
enomem:
    kfree(set->tags);
    set->tags = NULL;
    return -ENOMEM;
}
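
To summarize the contract: a driver fills in a blk_mq_tag_set, calls blk_mq_alloc_tag_set() once, then creates one or more request queues from it. A minimal sketch follows (my_mq_ops and struct my_cmd are hypothetical placeholders, not real driver symbols):

static struct blk_mq_tag_set my_tag_set;    /* hypothetical driver state */
static struct request_queue *my_queue;

static int my_driver_init_mq(void)
{
    my_tag_set.ops = &my_mq_ops;        /* hypothetical blk_mq_ops table */
    my_tag_set.nr_hw_queues = 1;
    my_tag_set.queue_depth = 64;
    my_tag_set.numa_node = NUMA_NO_NODE;
    my_tag_set.cmd_size = sizeof(struct my_cmd);    /* per-request driver payload */

    if (blk_mq_alloc_tag_set(&my_tag_set))
        return -ENOMEM;

    my_queue = blk_mq_init_queue(&my_tag_set);    /* one request queue on this tag set */
    if (IS_ERR(my_queue)) {
        blk_mq_free_tag_set(&my_tag_set);
        return PTR_ERR(my_queue);
    }
    return 0;
}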

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */

static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
    unsigned int depth;
    int err;

    depth = set->queue_depth;
    do {
        err = __blk_mq_alloc_rq_maps(set);    /* on success, set->tags[] and friends are ready; otherwise halve the depth and retry */
        if (!err)
            break;

        set->queue_depth >>= 1;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
            err = -ENOMEM;
            break;
        }
    } while (set->queue_depth);

    if (!set->queue_depth || err) {
        pr_err("blk-mq: failed to allocate request map\n");
        return -ENOMEM;
    }

    if (depth != set->queue_depth)
        pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                        depth, set->queue_depth);

    return 0;
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
    int i;

    for (i = 0; i < set->nr_hw_queues; i++) {    /* loop over the hardware queues */
        set->tags[i] = blk_mq_init_rq_map(set, i);    /* initialize tags[i] */
        if (!set->tags[i])
            goto out_unwind;
    }

    return 0;

out_unwind:
    while (--i >= 0)
        blk_mq_free_rq_map(set, set->tags[i], i);

    return -ENOMEM;
}

static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
        unsigned int hctx_idx)
{
    struct blk_mq_tags *tags;
    unsigned int i, j, entries_per_page, max_order = 4;
    size_t rq_size, left;

    tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
                set->numa_node);    /* initialize the tags */
    if (!tags)
        return NULL;

    INIT_LIST_HEAD(&tags->page_list);

    tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
                 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
                 set->numa_node);    /* allocate request pointers, queue_depth per queue */
    if (!tags->rqs) {
        blk_mq_free_tags(tags);
        return NULL;
    }

    /*
     * rq_size is the size of the request plus driver payload, rounded
     * to the cacheline size
     */
    rq_size = round_up(sizeof(struct request) + set->cmd_size,
                cache_line_size());    /* each slot is a struct request followed by the driver's cmd_size payload */
    left = rq_size * set->queue_depth;

    for (i = 0; i < set->queue_depth;) {
        int this_order = max_order;
        struct page *page;
        int to_do;
        void *p;

        while (left < order_to_size(this_order - 1) && this_order)
            this_order--;

        do {
            page = alloc_pages_node(set->numa_node,
                GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
                this_order);
            if (page)
                break;
            if (!this_order--)
                break;
            if (order_to_size(this_order) < rq_size)
                break;
        } while (1);

        if (!page)
            goto fail;

        page->private = this_order;
        list_add_tail(&page->lru, &tags->page_list);

        p = page_address(page);
        entries_per_page = order_to_size(this_order) / rq_size;
        to_do = min(entries_per_page, set->queue_depth - i);
        left -= to_do * rq_size;
        for (j = 0; j < to_do; j++) {
            tags->rqs[i] = p;
            tags->rqs[i]->atomic_flags = 0;
            tags->rqs[i]->cmd_flags = 0;
            if (set->ops->init_request) {
                if (set->ops->init_request(set->driver_data,
                        tags->rqs[i], hctx_idx, i,
                        set->numa_node)) {    /* call the driver's init_request hook on each request */
                    tags->rqs[i] = NULL;
                    goto fail;
                }
            }

            p += rq_size;
            i++;
        }
    }

    return tags;

fail:
    blk_mq_free_rq_map(set, tags, hctx_idx);
    return NULL;
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                     unsigned int reserved_tags, int node)
{
    struct blk_mq_tags *tags;

    if (total_tags > BLK_MQ_TAG_MAX) {
        pr_err("blk-mq: tag depth too large\n");
        return NULL;
    }

    tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);    /* allocate the tags structure */
    if (!tags)
        return NULL;

    tags->nr_tags = total_tags;
    tags->nr_reserved_tags = reserved_tags;

    return blk_mq_init_bitmap_tags(tags, node);    /* initialize the bitmap tags */
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
                               int node)
{
    unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;    /* depth = total tags minus reserved tags */

    if (bt_alloc(&tags->bitmap_tags, depth, node, false))    /* initialize bitmap_tags */
        goto enomem;
    if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))    /* initialize breserved_tags */
        goto enomem;

    return tags;
enomem:
    bt_free(&tags->bitmap_tags);
    kfree(tags);
    return NULL;
}

static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
            int node, bool reserved)
{
    int i;

    bt->bits_per_word = ilog2(BITS_PER_LONG);    /* with BITS_PER_LONG == 64, bits_per_word starts at 6 */

    /*
     * Depth can be zero for reserved tags, that's not a failure
     * condition.
     */
    if (depth) {    /* for the admin queue, depth == 64 here */
        unsigned int nr, tags_per_word;

        tags_per_word = (1 << bt->bits_per_word);

        /*
         * If the tag space is small, shrink the number of tags
         * per word so we spread over a few cachelines, at least.
         * If less than 4 tags, just forget about it, it's not
         * going to work optimally anyway.
         */
        if (depth >= 4) {
            while (tags_per_word * 4 > depth) {
                bt->bits_per_word--;
                tags_per_word = (1 << bt->bits_per_word);
            }
        }

        nr = ALIGN(depth, tags_per_word) / tags_per_word;    /* ALIGN rounds depth up to a multiple of tags_per_word; each word tracks tags_per_word tags, so depth tags need nr words */
        bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
                        GFP_KERNEL, node);    /* allocate nr bitmap words to track the tags */
        if (!bt->map)
            return -ENOMEM;

        bt->map_nr = nr;
    }

    bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
    if (!bt->bs) {
        kfree(bt->map);
        return -ENOMEM;
    }

    bt_update_count(bt, depth);    /* distribute depth across the map words */

    for (i = 0; i < BT_WAIT_QUEUES; i++) {
        init_waitqueue_head(&bt->bs[i].wait);
        atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
    }

    return 0;
}
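
To follow the shrink loop with concrete numbers, here is the admin-queue case worked out (depth = 64 on a 64-bit kernel):

/* Worked example: depth = 64, BITS_PER_LONG = 64.
 *   bits_per_word = ilog2(64) = 6, so tags_per_word = 64.
 * Shrink loop (runs while tags_per_word * 4 > depth):
 *   64 * 4 > 64  -> bits_per_word = 5, tags_per_word = 32
 *   32 * 4 > 64  -> bits_per_word = 4, tags_per_word = 16
 *   16 * 4 > 64  -> false, stop.
 * nr = ALIGN(64, 16) / 16 = 4, so the 64 tags are spread over four
 * blk_align_bitmap words of 16 tags each, reducing cacheline
 * contention between CPUs grabbing tags concurrently. */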

/*
 * Initialize the request queue.
 */

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
    struct blk_mq_hw_ctx **hctxs;
    struct blk_mq_ctx __percpu *ctx;
    struct request_queue *q;
    unsigned int *map;
    int i;

    ctx = alloc_percpu(struct blk_mq_ctx);    /* allocate the per-cpu software contexts */
    if (!ctx)
        return ERR_PTR(-ENOMEM);

    hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,    /* allocate nr_hw_queues hardware-context pointers */
            set->numa_node);

    if (!hctxs)
        goto err_percpu;

    map = blk_mq_make_queue_map(set);    /* build the cpu -> hw queue mapping */
    if (!map)
        goto err_map;

    for (i = 0; i < set->nr_hw_queues; i++) {
        int node = blk_mq_hw_queue_to_node(map, i);

        hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),    /* allocate and fill in each hctx */
                        GFP_KERNEL, node);
        if (!hctxs[i])
            goto err_hctxs;

        if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
                        node))
            goto err_hctxs;

        atomic_set(&hctxs[i]->nr_active, 0);
        hctxs[i]->numa_node = node;
        hctxs[i]->queue_num = i;
    }

    q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);    /* allocate and initialize the request_queue itself */
    if (!q)
        goto err_hctxs;

    /*
     * Init percpu_ref in atomic mode so that it's faster to shutdown.
     * See blk_register_queue() for details.
     */
    if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
        goto err_map;

    setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
    blk_queue_rq_timeout(q, 30000);

    q->nr_queues = nr_cpu_ids;
    q->nr_hw_queues = set->nr_hw_queues;
    q->mq_map = map;

    q->queue_ctx = ctx;
    q->queue_hw_ctx = hctxs;

    q->mq_ops = set->ops;
    q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

    if (!(set->flags & BLK_MQ_F_SG_MERGE))
        q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;

    q->sg_reserved_size = INT_MAX;

    INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
    INIT_LIST_HEAD(&q->requeue_list);
    spin_lock_init(&q->requeue_lock);

    if (q->nr_hw_queues > 1)
        blk_queue_make_request(q, blk_mq_make_request);    /* set the make_request function pointer */
    else
        blk_queue_make_request(q, blk_sq_make_request);

    if (set->timeout)
        blk_queue_rq_timeout(q, set->timeout);

    /*
     * Do this after blk_queue_make_request() overrides it...
     */
    q->nr_requests = set->queue_depth;

    if (set->ops->complete)
        blk_queue_softirq_done(q, set->ops->complete);

    blk_mq_init_cpu_queues(q, set->nr_hw_queues);    /* initialize the software queues */

    if (blk_mq_init_hw_queues(q, set))    /* initialize the hardware queues */
        goto err_hw;

    mutex_lock(&all_q_mutex);
    list_add_tail(&q->all_q_node, &all_q_list);
    mutex_unlock(&all_q_mutex);

    blk_mq_add_queue_tag_set(set, q);

    blk_mq_map_swqueue(q);    /* map software queues onto hardware queues */

    return q;

err_hw:
    blk_cleanup_queue(q);
err_hctxs:
    kfree(map);
    for (i = 0; i < set->nr_hw_queues; i++) {
        if (!hctxs[i])
            break;
        free_cpumask_var(hctxs[i]->cpumask);
        kfree(hctxs[i]);
    }
err_map:
    kfree(hctxs);
err_percpu:
    free_percpu(ctx);
    return ERR_PTR(-ENOMEM);
}

The next function sets up the I/O queues. By this point the admin queue is fully initialized, so commands can be issued on it to create the I/O queues.

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
    struct nvme_queue *adminq = dev->queues[0];
    struct pci_dev *pdev = dev->pci_dev;
    int result, i, vecs, nr_io_queues, size;

    nr_io_queues = num_possible_cpus();    /* aim for one I/O queue per possible CPU */
    result = set_queue_count(dev, nr_io_queues);    /* send a Set Features command to request that many I/O queues */
    if (result <= 0)
        return result;
    if (result < nr_io_queues)
        nr_io_queues = result;

    size = db_bar_size(dev, nr_io_queues);    /* compute the required BAR size; above 8192 bytes, remap a larger region */
    if (size > 8192) {
        iounmap(dev->bar);
        do {
            dev->bar = ioremap(pci_resource_start(pdev, 0), size);
            if (dev->bar)
                break;
            if (!--nr_io_queues)
                return -ENOMEM;
            size = db_bar_size(dev, nr_io_queues);
        } while (1);
        dev->dbs = ((void __iomem *)dev->bar) + 4096;
        adminq->q_db = dev->dbs;
    }

    /* Deregister the admin queue's interrupt */
    free_irq(dev->entry[0].vector, adminq);

    /*
     * If we enable msix early due to not intx, disable it again before
     * setting up the full range we need.
     */
    if (!pdev->irq)
        pci_disable_msix(pdev);

    for (i = 0; i < nr_io_queues; i++)
        dev->entry[i].entry = i;
    vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);    /* request a range of MSI-X vectors */

    if (vecs < 0) {
        vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
        if (vecs < 0) {
            vecs = 1;
        } else {
            for (i = 0; i < vecs; i++)
                dev->entry[i].vector = i + pdev->irq;
        }
    }

    /*
     * Should investigate if there's a performance win from allocating
     * more queues than interrupt vectors; it might allow the submission
     * path to scale better, even if the receive path is limited by the
     * number of interrupts.
     */
    nr_io_queues = vecs;
    dev->max_qid = nr_io_queues;

    result = queue_request_irq(dev, adminq, adminq->irqname);
    if (result)
        goto free_queues;

    /* Free previously allocated queues that are no longer usable */
    nvme_free_queues(dev, nr_io_queues + 1);
    nvme_create_io_queues(dev);    /* create the I/O queues */

    return 0;

free_queues:
    nvme_free_queues(dev, 1);
    return result;
}
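
For the remap check above, db_bar_size() computes the space needed for the register block plus doorbells. A sketch of the arithmetic, matching this driver's definition as best I recall it (so treat it as hedged):

/* Sketch of db_bar_size(): 4 KB of controller registers plus one
 * SQ-tail/CQ-head doorbell pair per queue (admin included), each
 * doorbell 4 bytes wide, spaced by db_stride. */
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
    return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}
/* With db_stride == 1 this exceeds 8192 only at 512 or more I/O
 * queues, so the remap loop above rarely runs on ordinary hardware. */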

static void nvme_create_io_queues(struct nvme_dev *dev)
{
    unsigned i;

    for (i = dev->queue_count; i <= dev->max_qid; i++)
        if (!nvme_alloc_queue(dev, i, dev->q_depth))    /* allocate an nvmeq, record it in dev->queues[], and allocate its SQ and CQ memory */
            break;

    for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
        if (nvme_create_queue(dev->queues[i], i))    /* send admin commands to create the CQ and SQ on the controller */
            break;
}
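
nvme_create_queue() itself is not quoted in the post; its flow in this kernel is roughly the sketch below (a hedged reconstruction, not a verbatim copy), issuing the completion queue first because the submission queue's create command references it:

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
    struct nvme_dev *dev = nvmeq->dev;
    int result;

    result = adapter_alloc_cq(dev, qid, nvmeq);    /* admin cmd: Create I/O Completion Queue */
    if (result < 0)
        return result;

    result = adapter_alloc_sq(dev, qid, nvmeq);    /* admin cmd: Create I/O Submission Queue (points at the CQ) */
    if (result < 0)
        goto release_cq;

    result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
    if (result < 0)
        goto release_sq;

    nvme_init_queue(nvmeq, qid);
    return result;

 release_sq:
    adapter_delete_sq(dev, qid);
 release_cq:
    adapter_delete_cq(dev, qid);
    return result;
}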

static int nvme_dev_add(struct nvme_dev *dev)
{
    struct pci_dev *pdev = dev->pci_dev;
    int res;
    unsigned nn, i;
    struct nvme_ns *ns;
    struct nvme_id_ctrl *ctrl;
    struct nvme_id_ns *id_ns;
    void *mem;
    dma_addr_t dma_addr;
    int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;    /* shift for the controller's minimum page size */

    mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);    /* 8192 bytes of DMA memory to hold the identify data */
    if (!mem)
        return -ENOMEM;

    res = nvme_identify(dev, 0, 1, dma_addr);    /* send the Identify Controller command */
    if (res) {
        dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
        res = -EIO;
        goto out;
    }

    ctrl = mem;
    nn = le32_to_cpup(&ctrl->nn);    /* number of namespaces */
    dev->oncs = le16_to_cpup(&ctrl->oncs);
    dev->abort_limit = ctrl->acl + 1;
    dev->vwc = ctrl->vwc;
    dev->event_limit = min(ctrl->aerl + 1, 8);
    memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
    memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
    memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
    if (ctrl->mdts)
        dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
    if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
            (pdev->device == 0x0953) && ctrl->vs[3]) {
        unsigned int max_hw_sectors;

        dev->stripe_size = 1 << (ctrl->vs[3] + shift);
        max_hw_sectors = dev->stripe_size >> (shift - 9);
        if (dev->max_hw_sectors) {
            dev->max_hw_sectors = min(max_hw_sectors,
                            dev->max_hw_sectors);
        } else
            dev->max_hw_sectors = max_hw_sectors;
    }

    dev->tagset.ops = &nvme_mq_ops;
    dev->tagset.nr_hw_queues = dev->online_queues - 1;
    dev->tagset.timeout = NVME_IO_TIMEOUT;
    dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
    dev->tagset.queue_depth =
                min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
    dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
    dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
    dev->tagset.driver_data = dev;

    if (blk_mq_alloc_tag_set(&dev->tagset))    /* set up the tag set used by the I/O queues */
        goto out;

    id_ns = mem;
    for (i = 1; i <= nn; i++) {
        res = nvme_identify(dev, i, 0, dma_addr);    /* send Identify Namespace for each nsid */
        if (res)
            continue;

        if (id_ns->ncap == 0)
            continue;

        res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
                        dma_addr + 4096, NULL);
        if (res)
            memset(mem + 4096, 0, 4096);

        ns = nvme_alloc_ns(dev, i, mem, mem + 4096);    /* allocate resources for each namespace */
        if (ns)
            list_add_tail(&ns->list, &dev->namespaces);
    }
    list_for_each_entry(ns, &dev->namespaces, list)
        add_disk(ns->disk);    /* register the disk; "registering the namespace" is really more accurate, since each namespace appears to user space as its own block device */
    res = 0;
out:
    dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
    return res;
}
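
A worked example for the mdts math above: with CAP.MPSMIN = 0 the controller's minimum page size is 4 KB, so shift = 12. An Identify MDTS value of 5 means a maximum transfer of 2^5 minimum-size pages = 128 KB, and indeed 1 << (5 + 12 - 9) = 256 sectors of 512 bytes = 128 KB.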

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
        struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
    struct nvme_ns *ns;
    struct gendisk *disk;
    int node = dev_to_node(&dev->pci_dev->dev);
    int lbaf;

    if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
        return NULL;

    ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
    if (!ns)
        return NULL;
    ns->queue = blk_mq_init_queue(&dev->tagset);    /* initialize the namespace's request queue */
    if (IS_ERR(ns->queue))
        goto out_free_ns;
    queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
    queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
    queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
    ns->dev = dev;
    ns->queue->queuedata = ns;

    disk = alloc_disk_node(0, node);    /* allocate the gendisk */
    if (!disk)
        goto out_free_queue;

    ns->ns_id = nsid;
    ns->disk = disk;
    lbaf = id->flbas & 0xf;
    ns->lba_shift = id->lbaf[lbaf].ds;
    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
    blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
    if (dev->max_hw_sectors)
        blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
    if (dev->stripe_size)
        blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
    if (dev->vwc & NVME_CTRL_VWC_PRESENT)
        blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);

    disk->major = nvme_major;
    disk->first_minor = 0;
    disk->fops = &nvme_fops;
    disk->private_data = ns;
    disk->queue = ns->queue;
    disk->driverfs_dev = &dev->pci_dev->dev;
    disk->flags = GENHD_FL_EXT_DEVT;
    sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
    set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

    if (dev->oncs & NVME_CTRL_ONCS_DSM)
        nvme_config_discard(ns);

    return ns;

out_free_queue:
    blk_cleanup_queue(ns->queue);
out_free_ns:
    kfree(ns);
    return NULL;
}
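
The set_capacity() call above converts the namespace size (NSZE, reported in LBAs) to 512-byte sectors. A quick worked example, with hypothetical numbers:

/* Worked example (hypothetical values): a namespace formatted with
 * 4 KB LBAs has lbaf[].ds = 12, so lba_shift = 12. With
 * nsze = 1000000 LBAs:
 *   capacity = 1000000 << (12 - 9) = 8000000 sectors of 512 bytes,
 * roughly 4.1 GB, which is what the gendisk reports to upper layers. */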

That completes our walk through the entire NVMe initialization process.

