The previous article looked at PCI devices from the hardware side: their characteristics and the various registers. In this one we turn to the Linux source and examine the kernel's PCI data structures, how they relate to one another, and how the machinery works.
2016-10-09
Note: the code below is from the Linux 3.11.1 kernel.
Basic data structures:
struct pci_bus
struct pci_bus {
    struct list_head node;      /* node in list of buses */
    struct pci_bus  *parent;    /* parent bus this bridge is on */
    struct list_head children;  /* list of child buses */
    struct list_head devices;   /* list of devices on this bus */
    struct pci_dev  *self;      /* bridge device as seen by parent */
    struct list_head slots;     /* list of slots on this bus */
    struct resource *resource[PCI_BRIDGE_RESOURCE_NUM];
    struct list_head resources; /* address space routed to this bus */
    struct resource busn_res;   /* bus numbers routed to this bus */

    struct pci_ops  *ops;       /* configuration access functions */
    void        *sysdata;       /* hook for sys-specific extension */
    struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */

    unsigned char   number;     /* bus number */
    unsigned char   primary;    /* number of primary bridge */
    unsigned char   max_bus_speed;  /* enum pci_bus_speed */
    unsigned char   cur_bus_speed;  /* enum pci_bus_speed */

    char        name[48];

    unsigned short  bridge_ctl; /* manage NO_ISA/FBB/et al behaviors */
    pci_bus_flags_t bus_flags;  /* Inherited by child busses */
    struct device       *bridge;
    struct device       dev;
    struct bin_attribute    *legacy_io;  /* legacy I/O for this bus */
    struct bin_attribute    *legacy_mem; /* legacy mem */
    unsigned int        is_added:1;
};
Every PCI bus has a corresponding pci_bus structure.
node links together the pci_bus structures of all root buses in the system;
parent points to this bus's parent, i.e. the bus one level up;
children is the head of the list of this bus's child buses;
devices is the head of the list of logical devices on this bus;
self points to the pci_dev structure of the bridge device that spawns this bus;
ops points to a structure describing how the configuration space is accessed;
number is the bus number;
primary is the number of the bus the bridge device itself sits on (see the traversal sketch below).
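To make the linkage concrete, here is a minimal sketch — an illustration, not kernel source, and it ignores locking — that walks the real pci_root_buses global (root buses are chained through pci_bus.node) and prints the devices on each root bus:

#include <linux/pci.h>

/* Illustration only: walk all root buses and the devices on each.
 * Child buses would be reached by recursing into bus->children. */
static void __init dump_root_buses(void)
{
    struct pci_bus *bus;
    struct pci_dev *dev;

    /* root buses are chained through pci_bus.node */
    list_for_each_entry(bus, &pci_root_buses, node) {
        /* devices hang off bus->devices via pci_dev.bus_list */
        list_for_each_entry(dev, &bus->devices, bus_list)
            printk(KERN_INFO "bus %02x devfn %02x: %04x:%04x\n",
                   bus->number, dev->devfn,
                   dev->vendor, dev->device);
    }
}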
struct pci_dev
struct pci_dev {
    struct list_head bus_list;  /* node in per-bus list (all devices on the same bus) */
    struct pci_bus  *bus;       /* bus this device is on */
    struct pci_bus  *subordinate;   /* bus this device bridges to (for a PCI bridge, the downstream bus) */

    void        *sysdata;       /* hook for sys-specific extension */
    struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */
    struct pci_slot *slot;      /* Physical slot this device is in */

    unsigned int    devfn;      /* encoded device & function index */
    unsigned short  vendor;
    unsigned short  device;
    unsigned short  subsystem_vendor;
    unsigned short  subsystem_device;
    unsigned int    class;      /* 3 bytes: (base,sub,prog-if) */
    u8      revision;   /* PCI revision, low byte of class word */
    u8      hdr_type;   /* PCI header type (`multi' flag masked out) */
    u8      pcie_cap;   /* PCI-E capability offset */
    u8      msi_cap;    /* MSI capability offset */
    u8      msix_cap;   /* MSI-X capability offset */
    u8      pcie_mpss:3;    /* PCI-E Max Payload Size Supported */
    u8      rom_base_reg;   /* which config register controls the ROM */
    u8      pin;        /* which interrupt pin this device uses */
    u16     pcie_flags_reg; /* cached PCI-E Capabilities Register */

    struct pci_driver *driver;  /* which driver has allocated this device */
    u64     dma_mask;   /* Mask of the bits of bus address this
                           device implements.  Normally this is
                           0xffffffff.  You only need to change
                           this if your device has broken DMA
                           or supports 64-bit transfers.  */

    struct device_dma_parameters dma_parms;

    pci_power_t current_state;  /* Current operating state. In ACPI-speak,
                                   this is D0-D3, D0 being fully functional,
                                   and D3 being off. */
    u8      pm_cap;     /* PM capability offset */
    unsigned int    pme_support:5;  /* Bitmask of states from which PME#
                                       can be generated */
    unsigned int    pme_interrupt:1;
    unsigned int    pme_poll:1; /* Poll device's PME status bit */
    unsigned int    d1_support:1;   /* Low power state D1 is supported */
    unsigned int    d2_support:1;   /* Low power state D2 is supported */
    unsigned int    no_d1d2:1;  /* D1 and D2 are forbidden */
    unsigned int    no_d3cold:1;    /* D3cold is forbidden */
    unsigned int    d3cold_allowed:1;   /* D3cold is allowed by user */
    unsigned int    mmio_always_on:1;   /* disallow turning off io/mem
                                           decoding during bar sizing */
    unsigned int    wakeup_prepared:1;
    unsigned int    runtime_d3cold:1;   /* whether go through runtime
                                           D3cold, not set for devices
                                           powered on/off by the
                                           corresponding bridge */
    unsigned int    d3_delay;   /* D3->D0 transition time in ms */
    unsigned int    d3cold_delay;   /* D3cold->D0 transition time in ms */

#ifdef CONFIG_PCIEASPM
    struct pcie_link_state  *link_state;    /* ASPM link state. */
#endif

    pci_channel_state_t error_state;    /* current connectivity state */
    struct device   dev;        /* Generic device interface */

    int     cfg_size;   /* Size of configuration space */

    /*
     * Instead of touching interrupt line and base address registers
     * directly, use the values stored here. They might be different!
     */
    unsigned int    irq;
    struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */

    bool match_driver;      /* Skip attaching driver */
    /* These fields are used by common fixups */
    unsigned int    transparent:1;  /* Transparent PCI bridge */
    unsigned int    multifunction:1;/* Part of multi-function device */
    /* keep track of device state */
    unsigned int    is_added:1;
    unsigned int    is_busmaster:1; /* device is busmaster */
    unsigned int    no_msi:1;   /* device may not use msi */
    unsigned int    block_cfg_access:1; /* config space access is blocked */
    unsigned int    broken_parity_status:1; /* Device generates false positive parity */
    unsigned int    irq_reroute_variant:2;  /* device needs IRQ rerouting variant */
    unsigned int    msi_enabled:1;
    unsigned int    msix_enabled:1;
    unsigned int    ari_enabled:1;  /* ARI forwarding */
    unsigned int    is_managed:1;
    unsigned int    is_pcie:1;  /* Obsolete. Will be removed.
                                   Use pci_is_pcie() instead */
    unsigned int    needs_freset:1; /* Dev requires fundamental reset */
    unsigned int    state_saved:1;
    unsigned int    is_physfn:1;
    unsigned int    is_virtfn:1;
    unsigned int    reset_fn:1;
    unsigned int    is_hotplug_bridge:1;
    unsigned int    __aer_firmware_first_valid:1;
    unsigned int    __aer_firmware_first:1;
    unsigned int    broken_intx_masking:1;
    unsigned int    io_window_1k:1; /* Intel P2P bridge 1K I/O windows */
    pci_dev_flags_t dev_flags;
    atomic_t    enable_cnt; /* pci_enable_device has been called */

    u32     saved_config_space[16]; /* config space saved at suspend time */
    struct hlist_head saved_cap_space;
    struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */
    int rom_attr_enabled;       /* has display of the rom attribute been enabled? */
    struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
    struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
#ifdef CONFIG_PCI_MSI
    struct list_head msi_list;
    struct kset *msi_kset;
#endif
    struct pci_vpd *vpd;
#ifdef CONFIG_PCI_ATS
    union {
        struct pci_sriov *sriov;    /* SR-IOV capability related */
        struct pci_dev *physfn; /* the PF this VF is associated with */
    };
    struct pci_ats  *ats;   /* Address Translation Service */
#endif
    phys_addr_t rom;    /* Physical address of ROM if it's not from the BAR */
    size_t romlen;      /* Length of ROM if it's not from the BAR */
};
Every logical device on a PCI bus gets one of these structures.
struct resource
struct resource {
    resource_size_t start;
    resource_size_t end;
    const char *name;
    unsigned long flags;
    struct resource *parent, *sibling, *child;
};
The system manages address-space resources with the resource structure. Since the point here is to map address space (PIO or MMIO) to devices, an address range is itself a kind of resource, and each range is described by one resource structure holding the range's start and end addresses, a name, and some flag bits. The system maintains two global resource trees: one managing I/O port space (rooted at ioport_resource) and one managing I/O memory space (rooted at iomem_resource). As the parent/sibling/child pointers show, these structures are organized as a tree, somewhat like the way Windows manages virtual memory; this makes it fairly efficient to allocate an unused range or to check whether a candidate range overlaps an existing one.
start is the start address of the range;
end is the end address of the range;
name names the kind of range;
flags records the range's flag bits;
parent points to the parent resource, child points to the first child, and children of the same parent are chained together through sibling. The overall shape is a tree, as the sketch below illustrates.
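The two global tree roots are defined in kernel/resource.c; walking a tree is then a simple child/sibling recursion. The two definitions below are the kernel's own, while dump_resource_tree is a hypothetical helper added purely for illustration:

struct resource ioport_resource = {
    .name   = "PCI IO",
    .start  = 0,
    .end    = IO_SPACE_LIMIT,
    .flags  = IORESOURCE_IO,
};

struct resource iomem_resource = {
    .name   = "PCI mem",
    .start  = 0,
    .end    = -1,
    .flags  = IORESOURCE_MEM,
};

/* Hypothetical depth-first walk: descend into child first,
 * then move along the sibling chain. */
static void dump_resource_tree(struct resource *res, int depth)
{
    while (res) {
        printk(KERN_INFO "%*s%s: [%pa-%pa]\n", depth * 2, "",
               res->name ? res->name : "?", &res->start, &res->end);
        dump_resource_tree(res->child, depth + 1);
        res = res->sibling;
    }
}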
Hmm — counting them up, there really aren't that many structures involved, so let's not linger and instead walk through how the Linux kernel probes and initializes PCI devices.
PCI devices can be probed in several ways, broadly divided into BIOS probing and direct probing; direct probing further splits into two types (configuration mechanisms #1 and #2). Generally, any PC built around the PCI bus must have BIOS support for it — hence the name PCI BIOS. During the power-on self test, this BIOS starts probing and scanning from the first bridge in the system, the host-PCI bridge, enumerating and recording every PCI device on the first PCI bus one by one. If one of those devices is a PCI-PCI bridge, it goes a level deeper and probes the devices on the bus behind that bridge, recursing until all PCI devices are exhausted. But not every system has such a BIOS, so the Linux kernel later added a way to probe PCI devices directly, bypassing the BIOS.
In the kernel, PCI device initialization has two entry points:
1. arch_initcall(pci_arch_init);
2. subsys_initcall(pci_subsys_init);
arch_initcall has higher priority than subsys_initcall, so pci_arch_init runs before pci_subsys_init. The analysis below follows the same two parts.
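The ordering comes straight from the initcall levels in include/linux/init.h (abridged here; the _sync variants are omitted): arch_initcall registers its function at level 3, subsys_initcall at level 4, and boot runs the levels in ascending order:

#define pure_initcall(fn)       __define_initcall(fn, 0)
#define core_initcall(fn)       __define_initcall(fn, 1)
#define postcore_initcall(fn)       __define_initcall(fn, 2)
#define arch_initcall(fn)       __define_initcall(fn, 3)
#define subsys_initcall(fn)     __define_initcall(fn, 4)
#define fs_initcall(fn)         __define_initcall(fn, 5)
#define device_initcall(fn)     __define_initcall(fn, 6)
#define late_initcall(fn)       __define_initcall(fn, 7)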
1. pci_arch_init
static __init int pci_arch_init(void)
{
#ifdef CONFIG_PCI_DIRECT
    int type = 0;

    type = pci_direct_probe();
#endif

    if (!(pci_probe & PCI_PROBE_NOEARLY))
        pci_mmcfg_early_init();

    if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
        return 0;

#ifdef CONFIG_PCI_BIOS
    pci_pcbios_init();
#endif

    /*
     * don't check for raw_pci_ops here because we want pcbios as last
     * fallback, yet it's needed to run first to set pcibios_last_bus
     * in case legacy PCI probing is used. otherwise detecting peer busses
     * fails.
     */
#ifdef CONFIG_PCI_DIRECT
    pci_direct_init(type);
#endif

    if (!raw_pci_ops && !raw_pci_ext_ops)
        printk(KERN_ERR
        "PCI: Fatal: No config space access function found\n");

    dmi_check_pciprobe();

    dmi_check_skip_isa_align();

    return 0;
}
This function handles the architecture-specific side. It takes different paths depending on the configured probing method; here we only follow CONFIG_PCI_DIRECT, where, as you can see, it essentially just calls pci_direct_probe:
int __init pci_direct_probe(void)
{
    if ((pci_probe & PCI_PROBE_CONF1) == 0)
        goto type2;
    /* reserve eight bytes of I/O space starting at 0xCF8 for PCI config access */
    if (!request_region(0xCF8, 8, "PCI conf1"))
        goto type2;

    if (pci_check_type1()) {
        raw_pci_ops = &pci_direct_conf1;
        port_cf9_safe = true;
        return 1;
    }
    release_region(0xCF8, 8);

type2:
    ...

    release_region(0xC000, 0x1000);
fail2:
    release_region(0xCF8, 4);
    return 0;
}
The job here is simple: reserve 8 bytes of I/O address space — ports 0xCF8~0xCFF — for accessing PCI configuration space. The first four bytes act as the address port and the last four as the data port.
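In other words, once this succeeds, a mechanism #1 config read looks like the sketch below. This is a simplified illustration of what the kernel's pci_conf1_read in arch/x86/pci/direct.c does (the real version takes a spinlock and handles 1/2/4-byte accesses):

/* Pack bus/devfn/reg plus the enable bit (bit 31) into CONFIG_ADDRESS
 * at 0xCF8, then read the dword back from CONFIG_DATA at 0xCFC. */
static u32 conf1_read_dword(unsigned int bus, unsigned int devfn,
                unsigned int reg)
{
    outl(0x80000000 | (bus << 16) | (devfn << 8) | (reg & 0xfc), 0xCF8);
    return inl(0xCFC);
}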
It then calls pci_check_type1 for a further check — validating configuration mechanism #1:
static int __init pci_check_type1(void)
{
    unsigned long flags;
    unsigned int tmp;
    int works = 0;

    local_irq_save(flags);

    outb(0x01, 0xCFB);
    tmp = inl(0xCF8);
    outl(0x80000000, 0xCF8);
    if (inl(0xCF8) == 0x80000000 &&
        pci_sanity_check(&pci_direct_conf1)) {
        works = 1;
    }
    outl(tmp, 0xCF8);
    local_irq_restore(flags);

    return works;
}
Only a few lines, but dense. First 0x01 is written to 0xCFB, which on chipsets implementing both configuration mechanisms selects mechanism #1. Then the current four-byte contents of 0xCF8 are saved, 0x80000000 is written to the address port 0xCF8, and the port is read back. Recall the PCI config address format from the previous article: bit 31 (the enable bit) is set and everything else is zero, i.e. this addresses bus 0, device 0, function 0, register 0. Notice, though, that the code never reads the data port — it reads the address port back. That is precisely the point: on hardware supporting mechanism #1, CONFIG_ADDRESS is a real read/write register that latches whatever was written, so getting exactly 0x80000000 back confirms the register, and hence the access mechanism, exists. The saved value is restored before returning.
It then calls pci_sanity_check to make sure this access method actually returns sensible data — in effect, testing whether a host bridge is present:
static int __init pci_sanity_check(const struct pci_raw_ops *o)
{
    u32 x = 0;
    int year, devfn;

    if (pci_probe & PCI_NO_CHECKS)
        return 1;
    /* Assume Type 1 works for newer systems.
       This handles machines that don't have anything on PCI Bus 0. */
    dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL);
    if (year >= 2001)
        return 1;

    for (devfn = 0; devfn < 0x100; devfn++) {
        /* read the device's class */
        if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x))
            continue;
        if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA)
            return 1;
        /* read the vendor ID */
        if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x))
            continue;
        if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ)
            return 1;
    }

    DBG(KERN_WARNING "PCI: Sanity check failed\n");
    return 0;
}
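A note on the loop bound: devfn packs the device number into the upper five bits and the function number into the lower three, so 0x100 covers 32 device slots × 8 functions on bus 0. The kernel's standard macros in include/uapi/linux/pci.h spell this out:

#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)     (((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)     ((devfn) & 0x07)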
With that, the probe stage is done. Back in pci_arch_init, the next call is pci_direct_init:
void __init pci_direct_init(int type)
{
    if (type == 0)
        return;
    printk(KERN_INFO "PCI: Using configuration type %d for base access\n",
         type);
    if (type == 1) {
        raw_pci_ops = &pci_direct_conf1;
        if (raw_pci_ext_ops)
            return;
        if (!(pci_probe & PCI_HAS_IO_ECS))
            return;
        printk(KERN_INFO "PCI: Using configuration type 1 "
               "for extended access\n");
        raw_pci_ext_ops = &pci_direct_conf1;
        return;
    }

    raw_pci_ops = &pci_direct_conf2;
}
This one is short. pci_direct_probe already returned 1, so in effect all this does is set raw_pci_ops = &pci_direct_conf1, installing the correct methods for accessing PCI device configuration space. That concludes stage one; on to part two.
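For reference, raw_pci_ops points to a struct pci_raw_ops, defined in arch/x86/include/asm/pci_x86.h, so from here on a low-level config access boils down to an indirect call like the illustrative helper below (read_vendor_id is made up for this example):

struct pci_raw_ops {
    int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
                        int reg, int len, u32 *val);
    int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn,
                        int reg, int len, u32 *val);
};

/* Illustrative use: read the 16-bit vendor ID of bus 0, devfn 0. */
static int read_vendor_id(u32 *val)
{
    return raw_pci_ops->read(0, 0, 0, PCI_VENDOR_ID, 2, val);
}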
2. pci_subsys_init
int __init pci_subsys_init(void)
{
    /*
     * The init function returns an non zero value when
     * pci_legacy_init should be invoked.
     */
    /* x86_init.pci.init may be either pci_legacy_init or pci_acpi_init;
       presumably this check ensures pci_legacy_init runs as a fallback */
    if (x86_init.pci.init())
        pci_legacy_init();

    pcibios_fixup_peer_bridges();
    x86_init.pci.init_irq();
    pcibios_init();

    return 0;
}
In pci_x86.h there is this piece of initialization:
# ifdef CONFIG_ACPI
# define x86_default_pci_init   pci_acpi_init
# else
# define x86_default_pci_init   pci_legacy_init
# endif
So with ACPI configured it becomes pci_acpi_init, otherwise pci_legacy_init; for now we only consider the case without ACPI.
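For completeness, the default ops table in arch/x86/kernel/x86_init.c is where this macro gets wired in (abridged from memory, unrelated fields elided) — so without ACPI, x86_init.pci.init() is pci_legacy_init:

struct x86_init_ops x86_init __initdata = {
    /* ... */
    .pci = {
        .init           = x86_default_pci_init,
        .init_irq       = x86_default_pci_init_irq,
        .fixup_irqs     = x86_default_pci_fixup_irqs,
    },
    /* ... */
};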
int __init pci_legacy_init(void)
{
    if (!raw_pci_ops) {
        printk("PCI: System does not support PCI\n");
        return 0;
    }

    printk("PCI: Probing PCI hardware\n");
    pcibios_scan_root(0);
    return 0;
}
Part one already set raw_pci_ops, so in the normal case we fall straight through to pcibios_scan_root — the key function here. Reading pci_find_next_bus shows that before any root bus has been discovered it returns NULL, so the root bus must be probed via pci_scan_bus_on_node.
struct pci_bus *pcibios_scan_root(int busnum)
{
    struct pci_bus *bus = NULL;

    /* Once bus 0 has been found the rest is easy: walk the list of
       known buses, and if one with this bus number already exists it
       has been probed before, so just return it. */
    while ((bus = pci_find_next_bus(bus)) != NULL) {
        if (bus->number == busnum) {
            /* Already scanned */
            return bus;
        }
    }

    /* otherwise the bus still has to be located by bus number */
    return pci_scan_bus_on_node(busnum, &pci_root_ops,
                    get_mp_bus_to_node(busnum));
}
Let's see what pci_scan_bus_on_node does:
struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
{
    LIST_HEAD(resources);
    struct pci_bus *bus = NULL;
    struct pci_sysdata *sd;

    /*
     * Allocate per-root-bus (not per bus) arch-specific data.
     * TODO: leak; this memory is never freed.
     * It's arguable whether it's worth the trouble to care.
     */
    sd = kzalloc(sizeof(*sd), GFP_KERNEL);
    if (!sd) {
        printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno);
        return NULL;
    }
    sd->node = node;
    /* set up the bus's resources */
    x86_pci_root_bus_resources(busno, &resources);
    printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busno);
    bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
    if (!bus) {
        pci_free_resource_list(&resources);
        kfree(sd);
    }

    return bus;
}
Honestly, I'm not entirely clear on the sysdata business; it appears to be CPU-architecture specific, so we won't dwell on it here — I'll add more once I've figured it out. What we can see is the call to x86_pci_root_bus_resources, which sets up the bus's resources. Note that the resources of the devices on a bus are carved out of the bus's own resources — the "windows" mentioned earlier: a device's or bridge's window must be a sub-window of its bus's window. If possible, a later article will be devoted to resource allocation.
After that, pci_scan_root_bus is called to probe the root bus. First, though, here is x86_pci_root_bus_resources:
void x86_pci_root_bus_resources(int bus, struct list_head *resources)
{
    struct pci_root_info *info = x86_find_pci_root_info(bus);
    struct pci_root_res *root_res;
    struct pci_host_bridge_window *window;
    bool found = false;

    if (!info)
        goto default_resources;

    printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
           bus);

    /* already added by acpi ? */
    /* 'resources' is the list of windows; check whether the bus-number
       resource has already been added */
    list_for_each_entry(window, resources, list)
        if (window->res->flags & IORESOURCE_BUS) {
            found = true;
            break;
        }

    /* if not, it must be added now — effectively registering the
       bus-number resource with the system */
    if (!found)
        pci_add_resource(resources, &info->busn);

    list_for_each_entry(root_res, &info->resources, list) {
        struct resource *res;
        struct resource *root;

        res = &root_res->res;
        /* register the address-space resource with the system */
        pci_add_resource(resources, res);
        if (res->flags & IORESOURCE_IO)
            root = &ioport_resource;
        else
            root = &iomem_resource;
        /* insert res into the resource tree */
        insert_resource(root, res);
    }

    return;

default_resources:
    /*
     * We don't have any host bridge aperture information from the
     * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
     * so fall back to the defaults historically used by pci_create_bus().
     */
    printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
    pci_add_resource(resources, &ioport_resource);
    pci_add_resource(resources, &iomem_resource);
}
The function first checks the state of the bus-number resource. Bus numbers are a resource too: buses are enumerated depth-first, so all the buses below a given bus form a subtree whose numbers are contiguous and can therefore be described as an interval — just like I/O and memory resources — which is why bus numbers are also represented by a resource structure. A host bridge keeps a global resources list linking all of its pci_host_bridge_window entries, and bus-number, I/O-port, and I/O-memory resources all show up there. Why put it that way? Because every resource interval — one window per interval — gets a pci_host_bridge_window of its own. Here is the structure:
struct pci_host_bridge_window {
    struct list_head list;
    struct resource *res;       /* host bridge aperture (CPU address) */
    resource_size_t offset;     /* bus address + offset = CPU address */
};
list links the structure into the resources list; res points to the corresponding resource; and offset is the mapping offset, used mainly when mapping I/O ports and I/O memory to CPU (physical) addresses — for bus-number resources it is simply 0.
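So translating between the two address spaces is just an add or subtract against the window's offset, as in these hypothetical helpers (the kernel's real conversions are pcibios_resource_to_bus()/pcibios_bus_to_resource(), which first look up the right window):

/* bus address + offset = CPU address, per the comment in the struct */
static resource_size_t window_bus_to_cpu(struct pci_host_bridge_window *w,
                     resource_size_t bus_addr)
{
    return bus_addr + w->offset;
}

static resource_size_t window_cpu_to_bus(struct pci_host_bridge_window *w,
                     resource_size_t cpu_addr)
{
    return cpu_addr - w->offset;
}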
Back in x86_pci_root_bus_resources: during initialization we saw that x86_init.pci.init may be pci_acpi_init, in which case the resources have likely been set up already — hence the check. If not, pci_add_resource adds them: it allocates a pci_host_bridge_window, points it at the corresponding resource, and links it onto the global resources list. We won't step through its body, or we would never finish.
Then the address-space resources (I/O ports and I/O memory) are registered, in essentially two steps:
1. create the corresponding window and link it into the list;
2. insert the res into the global resource tree.
Step one needs no further comment. Step two effectively claims a range from the global pool — similar in spirit to the VAD tree in Windows or the red-black trees Linux uses elsewhere, though not identical. The tree layout was covered in an earlier article; the point is that once a range is inserted it is marked as occupied, and later allocations cannot conflict with it.
Next up is pci_scan_root_bus — and that is where the real bus probing happens!