Linux source version: 5.3.0
The kvmclock driver in the guest OS
kvmclock_init() mainly does the following:
- determines the MSRs each vCPU will use
- writes the physical address of pvclock_vsyscall_time_info, the structure each vCPU actually uses for kvmclock, into that vCPU's MSR via wrmsr
- registers the 1 GHz kvmclock as a clocksource with the system
void __init kvmclock_init(void)
{
	u8 flags;

	if (!kvm_para_available() || !kvmclock)	// return immediately if kvmclock is unsupported
		return;

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {	// check for the new kvmclock MSRs
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {	// if not even the old kvmclock MSRs are supported, return
		return;
	}

	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
			      kvmclock_setup_percpu, NULL) < 0) {	// hp: hotplug, BP: bootstrap; register a dynamic CPU-hotplug prepare callback
		return;
	}

	pr_info("kvm-clock: Using msrs %x and %x",
		msr_kvm_system_time, msr_kvm_wall_clock);	// msr_kvm_system_time: the MSR used for kvm system time
								// msr_kvm_wall_clock: the MSR used for the kvm wall clock

	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);	// store the (virtual) address of the first element of hv_clock_boot
								// in the per-cpu variable hv_clock_per_cpu
	/* write the physical address of vcpu0's hv_clock_per_cpu into the MSR selected by msr_kvm_system_time */
	kvm_register_clock("primary cpu clock");
	pvclock_set_pvti_cpu0_va(hv_clock_boot);	// record the address of the hv_clock_boot array in pvti_cpu0_va, a pvti-typed pointer

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))	// KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: clocksource-stable indicator bit
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);	// flag indicating whether the paravirtualized TSC is stable (what is it for, exactly?)

	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);	// read the flags of vcpu0's pvti structure
	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);	// initialize sched_clock(), used for scheduling timestamps and as an
								// accurate delay source driven by the hardware counter

	/* register the various callbacks */
	x86_platform.calibrate_tsc = kvm_get_tsc_khz;	// tsc_khz is the host's pTSC frequency
	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
	x86_platform.get_wallclock = kvm_get_wallclock;	// wallclock: seconds and nanoseconds at system boot (absolute time since 1970)
	x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;	// register the clocks of the non-boot vCPUs: each writes the physical
										// address of its own pvti (hv_clock_per_cpu) into its msr_kvm_system_time MSR
#endif
	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;	// save sched_clock state; in fact it does nothing
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
	machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
	kvm_get_preset_lpj();	// lpj: loops_per_jiffy

	/*
	 * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
	 * with P/T states and does not stop in deep C-states.
	 *
	 * Invariant TSC exposed by host means kvmclock is not necessary:
	 * can use TSC as clocksource.
	 */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
	    !check_tsc_unstable())
		kvm_clock.rating = 299;

	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);	// register the 1 GHz kvmclock clocksource
	pv_info.name = "KVM";
}
Three functions are registered as x86_platform callbacks here: kvm_get_tsc_khz, kvm_get_wallclock and kvm_set_wallclock.
kvm_get_wallclock()
/*
 * The wallclock is the time of day when we booted. Since then, some time may
 * have elapsed since the hypervisor wrote the data. So we try to account for
 * that with system time
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));	// write the physical address of wall_clock (struct pvclock_wall_clock) into the MSR
	preempt_disable();
	// the wallclock value ends up in now
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);	// read back the wallclock the host has written
	preempt_enable();
}
/* the contents of wall_clock have already been written by the host before this function runs */
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec64 *ts)
{
	u32 version;
	u64 delta;
	struct timespec64 now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		/*
		 * Note: wall_clock->sec is a u32 value, so it can
		 * only store dates between 1970 and 2106. To allow
		 * times beyond that, we need to create a new hypercall
		 * interface with an extended pvclock_wall_clock structure
		 * like ARM has.
		 */
		now.tv_sec = wall_clock->sec;	// read the wallclock; this is not yet the full time:
		now.tv_nsec = wall_clock->nsec;	// the delta below, i.e. the time the VM has been up, must still be added
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}
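pvclock_clocksource_read(), used above for the time-since-boot delta, is not quoted in these notes. Below is a rough user-space sketch of the scaling rule it builds on; the field names follow struct pvclock_vcpu_time_info, the values in main are made up, and this is an illustration rather than the kernel implementation.

#include <stdio.h>
#include <stdint.h>

/* Scale a TSC delta into nanoseconds the way pvclock does:
 * ns = (delta, shifted by tsc_shift) * tsc_to_system_mul / 2^32,
 * where tsc_to_system_mul is a 32.32 fixed-point factor. */
static uint64_t pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
	/* assume mul = ~0.8 * 2^32 and shift = -1, i.e. a 2.5 GHz TSC */
	uint64_t ns = pvclock_scale_delta(2500000000ULL, 3435973837u, -1);
	printf("%llu ns\n", (unsigned long long)ns);	/* one second of cycles -> ~1e9 ns */
	return 0;
}

The guest's system time is then pvti->system_time + pvclock_scale_delta(rdtsc() - pvti->tsc_timestamp, pvti->tsc_to_system_mul, pvti->tsc_shift).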
struct pvclock_wall_clock {
	u32   version;
	u32   sec;
	u32   nsec;
} __attribute__((__packed__));
kvm_get_tsc_khz()
// obtain the TSC frequency in kHz
static unsigned long kvm_get_tsc_khz(void)
{
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(this_cpu_pvti());
}
// derive the TSC frequency in kHz from tsc_shift and tsc_to_system_mul in the local cpu's pvti
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
	u64 pv_tsc_khz = 1000000ULL << 32;

	do_div(pv_tsc_khz, src->tsc_to_system_mul);
	if (src->tsc_shift < 0)
		pv_tsc_khz <<= -src->tsc_shift;
	else
		pv_tsc_khz >>= src->tsc_shift;

	return pv_tsc_khz;
}
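To see why this recovers the frequency: pvclock converts a TSC delta to nanoseconds as delta * tsc_to_system_mul * 2^tsc_shift / 2^32, so tsc_khz = 10^6 * 2^32 / (tsc_to_system_mul * 2^tsc_shift). A small self-contained check, with made-up values for a 2.5 GHz TSC (an illustration, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assume a 2.5 GHz TSC: 0.4 ns per cycle.
	 * With shift = -1, mul must be ~0.8 * 2^32. */
	int8_t   tsc_shift = -1;
	uint32_t tsc_to_system_mul = 3435973837u;	/* ~0.8 * 2^32 */

	uint64_t khz = (1000000ULL << 32) / tsc_to_system_mul;
	if (tsc_shift < 0)
		khz <<= -tsc_shift;
	else
		khz >>= tsc_shift;

	printf("recovered tsc_khz = %llu\n", (unsigned long long)khz);	/* ~2500000 */
	return 0;
}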
The code framework for setting time in the guest
How the kernel obtains the wallclock
static struct pvclock_wall_clock wall_clock __bss_decrypted;	// static global wall_clock, placed in the bss section

static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));	// write wall_clock's physical address into MSR_KVM_WALL_CLOCK, triggering a wrmsr vmexit
	preempt_disable();
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
	preempt_enable();
}
The very first statement of kvm_get_wallclock triggers a wrmsr vmexit, which then goes through the call chain:
handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	u64 data = msr_info->data;
	...
	case MSR_KVM_WALL_CLOCK_NEW:
	case MSR_KVM_WALL_CLOCK:
		vcpu->kvm->arch.wall_clock = data;
		kvm_write_wall_clock(vcpu->kvm, data);
		break;
	...
}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	...
	// boot time here is the host's boot time
	getboottime64(&boot);	// read the host boot time into boot

	if (kvm->arch.kvmclock_offset) {	// adjust the host boot time by the kvmclock offset
		struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
		boot = timespec64_sub(boot, ts);
	}

	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	// update the guest's wallclock
	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));	// copy wc, i.e. the host boot time, into the guest's wall_clock
	...
}
After kvm_write_wall_clock() finishes, the guest can find the host system's boot time in the global variable wall_clock.
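The elided part of kvm_write_wall_clock() bumps wc.version around the update so that the retry loop in pvclock_read_wallclock() (version & 1, version changed) can detect a concurrent write. A minimal stand-alone sketch of that writer/reader pairing, with hypothetical names, not the kernel code:

#include <stdint.h>

struct wc { uint32_t version, sec, nsec; };

/* writer (host side): version is odd while the update is in flight */
static void wc_write(struct wc *w, uint32_t sec, uint32_t nsec)
{
	w->version++;			/* now odd: readers will retry */
	__sync_synchronize();		/* order version before the payload */
	w->sec = sec;
	w->nsec = nsec;
	__sync_synchronize();		/* order payload before version */
	w->version++;			/* even again: snapshot is consistent */
}

/* reader (guest side): mirrors the do/while loop in pvclock_read_wallclock() */
static void wc_read(const struct wc *w, uint32_t *sec, uint32_t *nsec)
{
	uint32_t v;
	do {
		v = w->version;
		__sync_synchronize();
		*sec = w->sec;
		*nsec = w->nsec;
		__sync_synchronize();
	} while ((v & 1) || v != w->version);
}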
How the kernel updates the wallclock
void getboottime64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);	// boot time = tk->offs_real - tk->offs_boot

	*ts = ktime_to_timespec64(t);	// convert the time format
}
The key statement is struct timekeeper *tk = &tk_core.timekeeper;.
tk_core is defined as:
/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
static struct {
	seqcount_t		seq;
	struct timekeeper	timekeeper;
} tk_core ____cacheline_aligned = {
	.seq = SEQCNT_ZERO(tk_core.seq),
};
So tk_core is a statically allocated structure, whose seqcount lets readers detect a concurrent update by the single writer.
getboottime64() reads offs_real and offs_boot from tk_core.timekeeper; where is tk_core.timekeeper set? Searching the kernel turns up the code that updates tk->offs_boot:
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	tk->offs_boot = ktime_add(tk->offs_boot, delta);	// tk->offs_boot += delta
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 * VDSO (virtual dynamic shared object): some syscalls are called very
	 * frequently, and each one pays for a user-to-kernel transition. The
	 * kernel exports the results of such calls in a shared region that
	 * userspace can read directly, avoiding that cost.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);	// monotonic_to_boot is the VDSO-friendly form of offs_boot,
									// kept to speed up access to it
}
The core statement of tk_update_sleep_time() is tk->offs_boot = ktime_add(tk->offs_boot, delta);. What is delta and where does it come from? We have to find the callers of tk_update_sleep_time() in the kernel and the meaning of their arguments.
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
					   const struct timespec64 *delta)
{
	...
	// set the CLOCK_REALTIME time in tk
	tk_xtime_add(tk, delta);	// body: tk->xtime_sec += delta->tv_sec;
					//       tk->tkr_mono.xtime_nsec += delta->tv_nsec << tk->tkr_mono.shift;
					// xtime_sec: current CLOCK_REALTIME time in seconds
					// tkr: timekeeping read, the structure used when reading out time
					// tkr_mono.xtime_nsec: the nanosecond part, stored left-shifted by tkr_mono.shift
	...
	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
	...
}
__timekeeping_inject_sleeptime() still does not reveal where delta comes from; keep looking.
void timekeeping_resume(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock = tk->tkr_mono.clock;
	unsigned long flags;
	struct timespec64 ts_new, ts_delta;
	u64 cycle_now, nsec;
	bool inject_sleeptime = false;

	read_persistent_clock64(&ts_new);	// read the fresh wallclock into ts_new
	clockevents_resume();			// resume clock events
	clocksource_resume();			// resume the clocksource

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);
	/*
	 * After system resumes, we need to calculate the suspended time and
	 * compensate it for the OS time. There are 3 sources that could be
	 * used: Nonstop clocksource during suspend, persistent clock and rtc
	 * device.
	 *
	 * One specific platform may have 1 or 2 or all of them, and the
	 * preference will be:
	 * suspend-nonstop clocksource -> persistent clock -> rtc
	 * The less preferred source will only be tried if there is no better
	 * usable source. The rtc part is handled separately in rtc core code.
	 */
	cycle_now = tk_clock_read(&tk->tkr_mono);	// read the current clocksource counter
	nsec = clocksource_stop_suspend_timing(clock, cycle_now);	// total time spent suspended
	if (nsec > 0) {
		ts_delta = ns_to_timespec64(nsec);
		inject_sleeptime = true;
	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
		inject_sleeptime = true;
	}

	if (inject_sleeptime) {
		suspend_timing_needed = false;
		__timekeeping_inject_sleeptime(tk, &ts_delta);	// so delta is the time spent in suspend
	}
}
So the delta we have been chasing all along is the suspend duration. Tracing back bottom-up, the chain is tk_update_sleep_time <= __timekeeping_inject_sleeptime <= timekeeping_resume.
How the kernel initializes the wallclock
We now know how the kernel obtains the wallclock (via x86_platform.get_wallclock), and how it updates it (after a suspend, before resuming normal operation, __timekeeping_inject_sleeptime() modifies tk_core and with it the wallclock).
But the wallclock must also be set when the system initializes. How is it initialized? Presumably somewhere around timekeeping_init.
void __init timekeeping_init(void)
{
	...
	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);	// read the wall time and the boot offset into these two variables
	...
}

void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
				     struct timespec64 *boot_offset)
{
	read_persistent_clock64(wall_time);	// read the wall time
	*boot_offset = ns_to_timespec64(local_clock());
}
/* not static: needed by APM */
void read_persistent_clock64(struct timespec64 *ts)
{
	x86_platform.get_wallclock(ts);
}
Looks familiar: x86_platform.get_wallclock(ts). Under KVM, x86_platform.get_wallclock = kvm_get_wallclock; on the host, x86_platform.get_wallclock = vrtc_get_time.
void vrtc_get_time(struct timespec64 *now)
{
	u8 sec, min, hour, mday, mon;
	unsigned long flags;
	u32 year;

	spin_lock_irqsave(&rtc_lock, flags);

	while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
		cpu_relax();

	sec = vrtc_cmos_read(RTC_SECONDS);
	min = vrtc_cmos_read(RTC_MINUTES);
	hour = vrtc_cmos_read(RTC_HOURS);
	mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
	mon = vrtc_cmos_read(RTC_MONTH);
	year = vrtc_cmos_read(RTC_YEAR);

	spin_unlock_irqrestore(&rtc_lock, flags);

	/* vRTC YEAR reg contains the offset to 1972 */
	year += 1972;

	pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
		"mon: %d year: %d\n", sec, min, hour, mday, mon, year);

	now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
	now->tv_nsec = 0;
}
As we can see, walltime carries year/month/day/hour/minute/second, read out of the CMOS RTC.
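mktime64() turns that calendar date into seconds since 1970. A self-contained approximation of the conversion using standard civil-date arithmetic (my own sketch, not the kernel's implementation):

#include <stdio.h>
#include <stdint.h>

/* Days since 1970-01-01 for a civil date (Howard Hinnant's algorithm). */
static int64_t days_from_civil(int64_t y, unsigned m, unsigned d)
{
	y -= m <= 2;	/* treat Jan/Feb as months 13/14 of the prior year */
	int64_t era = (y >= 0 ? y : y - 399) / 400;
	unsigned yoe = (unsigned)(y - era * 400);			/* [0, 399] */
	unsigned doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1;	/* [0, 365] */
	unsigned doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;		/* [0, 146096] */
	return era * 146097 + (int64_t)doe - 719468;
}

static int64_t mktime64_sketch(int64_t year, unsigned mon, unsigned day,
			       unsigned hour, unsigned min, unsigned sec)
{
	return days_from_civil(year, mon, day) * 86400
		+ hour * 3600 + min * 60 + sec;
}

int main(void)
{
	/* example: 2020-06-01 00:00:00 UTC -> 1590969600 */
	printf("%lld\n", (long long)mktime64_sketch(2020, 6, 1, 0, 0, 0));
	return 0;
}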
The conclusion, then: the guest kvmclock's wallclock comes from the RTC, and it is shared by all vCPUs. A vCPU that wants the wallclock must write its own msr_wall_clock; but whenever the wallclock content is refreshed, every vCPU can read the latest value, not only the vCPU that wrote msr_wall_clock.
Initialization of system time
From the kvmclock driver's perspective, kvmclock_init() writes the physical address of the pvti structure of vcpu0 and of every other vCPU into the corresponding system_time_msr via a wrmsr.
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

#define HVC_BOOT_ARRAY_SIZE \
	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))	// how many pvti structures fit in one page

static struct pvclock_vsyscall_time_info
	hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);	// array of pvti structures
void __init kvmclock_init(void)
{
	...
	// select msr_system_time and msr_wall_clock
	msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
	msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	...
	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);	// point hv_clock_per_cpu at the first element of the pvti array
	kvm_register_clock("primary cpu clock");	// write the physical address of hv_clock_per_cpu into the matching system_time_msr
	pvclock_set_pvti_cpu0_va(hv_clock_boot);	// pvti_cpu0_va = hv_clock_boot: use the hv_clock_boot address as cpu0's pvti address
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;	// during smp_init, kvm_register_clock initializes the clock of every
										// cpu other than cpu0, passing each cpu's own hv_clock_per_cpu physical
										// address into its system_time_msr
#endif
	...
	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);	// register the 1 GHz kvmclock as a clocksource
}
static void kvm_setup_secondary_clock(void)
{
	kvm_register_clock("secondary cpu clock");
}
static void kvm_register_clock(char *txt)
{
	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
	u64 pa;

	if (!src)
		return;

	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;	// make sure bit 0 of the pvti address is set (the enable bit)
	wrmsrl(msr_kvm_system_time, pa);	// write this cpu's pvti physical address into msr_kvm_system_time via wrmsr
	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
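The low bit of the MSR payload doubles as an enable flag, which works because the pvti structure is page-aligned. A tiny illustration of the encoding and of how the host side splits it back apart (plain C with hypothetical values, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pvti_gpa = 0x7f3c2000ULL;	/* hypothetical guest-physical address of the pvti */
	uint64_t msr_val  = pvti_gpa | 0x01ULL;	/* guest side: address | enable bit */

	/* host side (kvm_set_msr_common): decode the payload */
	int	 enabled = msr_val & 1;		/* if clear, kvmclock is disabled */
	uint64_t gpa	 = msr_val & ~1ULL;	/* the actual pvti address */

	printf("enabled=%d gpa=%#llx\n", enabled, (unsigned long long)gpa);
	return 0;
}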
We now know that in kvmclock_init(), each cpu's pvti physical address is written into its system_time_msr via wrmsr. Two threads need tracing from here:
One leads upward: when is kvmclock_init() called, writing cpu0's pvti physical address into its MSR, and when is x86_cpuinit.early_percpu_clock_init called, writing the remaining cpus' pvti physical addresses into theirs?
The other leads downward: when the guest kernel writes the MSR, a wrmsr vmexit occurs; how does that vmexit handle the MSR?
As it turns out, kvmclock_init() is called while the guest boots its kernel: it writes vcpu0's pvti physical address into the MSR and registers kvm_setup_secondary_clock as the callback that writes the remaining vCPUs' pvti physical addresses into their MSRs.
Next, when is x86_cpuinit.early_percpu_clock_init called?
With the MSR-write path of every vCPU's pvti physical address now clear, on to the other thread: when the MSR write happens and triggers a vmexit, how does handle_wrmsr process system time? As with the wallclock, the call chain is:
handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	...
	case MSR_KVM_SYSTEM_TIME_NEW:
	case MSR_KVM_SYSTEM_TIME: {
		struct kvm_arch *ka = &vcpu->kvm->arch;

		kvmclock_reset(vcpu);	// set this vcpu's pv_time_enabled flag to false

		// on vcpu0, tmp records whether the old kvmclock MSR is being used
		if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {	// host_initiated is false when we come from handle_wrmsr()
			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);

			// if the old-vs-new kvmclock choice changed, issue KVM_REQ_MASTERCLOCK_UPDATE
			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

			ka->boot_vcpu_runs_old_kvmclock = tmp;
		}

		// record the pvti physical address in this vcpu's arch.time and issue KVM_REQ_GLOBAL_CLOCK_UPDATE
		vcpu->arch.time = data;
		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

		/* we verify if the enable bit is set... */
		if (!(data & 1))	// if bit 0 is clear, kvmclock will not be used
			break;

		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info)))
			vcpu->arch.pv_time_enabled = false;
		else
			vcpu->arch.pv_time_enabled = true;

		break;
	}
}
That is: if this is vcpu0 and the old-vs-new kvmclock MSR choice disagrees with the recorded boot_vcpu_runs_old_kvmclock flag, something has changed and the master clock must be recalibrated, so KVM_REQ_MASTERCLOCK_UPDATE is issued. Then the common per-vcpu handling follows.
Common per-vcpu handling: store the vcpu's pvti physical address in its arch.time and issue KVM_REQ_GLOBAL_CLOCK_UPDATE (so vcpu0 may issue two requests back to back). Afterwards pv_time_enabled is set to true or false according to the result of kvm_gfn_to_hva_cache_init. Let's look at that function.
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);	// look up the memslots backing vcpu->arch.pv_time

	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);	// initialize the vcpu->arch.pv_time cache, fill it in and validate it
}
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa);	// offset of the pvti physical address within its page
	gfn_t start_gfn = gpa >> PAGE_SHIFT;	// first guest frame number covered by the pvti physical address
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;	// last guest frame number covered by the pvti physical address
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;	// number of pages needed
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL;	// normally r == 0

	ghc->gpa = gpa;
	ghc->generation = slots->generation;	// memslots generation: tells which generation the cached contents belong to
	ghc->len = len;
	ghc->hva = KVM_HVA_ERR_BAD;

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) {	// make sure the requested slots are valid
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);	// resolve the host virtual address backing the pvti cache
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	return r;	// returns 0 on success
}
As we can see,
kvm_gfn_to_hva_cache_init(vcpu->kvm,
     &vcpu->arch.pv_time, data & ~1ULL,
     sizeof(struct pvclock_vcpu_time_info))
sets up cache space for vcpu->arch.pv_time (backed by a host virtual address) and stores the pvti physical address in the cache's gpa field.
So whenever resolving the host virtual address succeeds, vcpu->arch.pv_time_enabled ends up true.
To sum up: after each vCPU starts, its pvti physical address is written into msr_system_time_i and cache space is opened up, bridging host and guest.
Each vCPU's pvti needs only a single wrmsr to be associated with a host virtual address; after that no further wrmsr is required, and the host writes fresh pvti values from time to time.
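The gfn arithmetic above is plain address splitting. A quick standalone check of the start/end-gfn computation (PAGE_SHIFT = 12 as on x86; the gpa is a made-up value):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t gpa = 0x7f3c2ff8ULL;	/* hypothetical pvti address, 8 bytes before a page boundary */
	unsigned long len = 32;		/* sizeof(struct pvclock_vcpu_time_info) is 32 bytes */

	uint64_t offset    = gpa & (PAGE_SIZE - 1);		/* offset_in_page() */
	uint64_t start_gfn = gpa >> PAGE_SHIFT;
	uint64_t end_gfn   = (gpa + len - 1) >> PAGE_SHIFT;

	/* here the region crosses a page boundary, so two pages are needed
	 * and the cache falls back to the slow path (memslot = NULL) */
	printf("offset=%#llx start_gfn=%#llx pages=%llu\n",
	       (unsigned long long)offset,
	       (unsigned long long)start_gfn,
	       (unsigned long long)(end_gfn - start_gfn + 1));
	return 0;
}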
The guest reads system time from the pvti structure
The triggers for refreshing the system time the guest reads from the pvti are the three request types mentioned above:
KVM_REQ_MASTERCLOCK_UPDATE
KVM_REQ_GLOBAL_CLOCK_UPDATE
KVM_REQ_CLOCK_UPDATE
So when are these three requests issued? Let's go through them one by one.
Trigger points of the three clock-update requests
KVM_REQ_MASTERCLOCK_UPDATE
- While the master clock is enabled, KVM_REQ_MASTERCLOCK_UPDATE keeps being issued so the master clock stays up to date; this code lives in kvm_track_tsc_matching.
When can the master clock be enabled:
- the host clocksource must be tsc
- the vCPUs must have matched TSCs, i.e. the virtual TSCs must run at the same frequency as the host TSC
There are two call paths in total (traced from the bottom function upward):
The first:
kvm_track_tsc_matching => kvm_write_tsc => kvm_set_msr_common writing MSR_IA32_TSC
I.e. while the guest OS runs, if kvm_set_msr_common(MSR_IA32_TSC) occurs and the master-clock enabling conditions hold with the master clock enabled, KVM_REQ_MASTERCLOCK_UPDATE is issued.
The second:
kvm_track_tsc_matching => kvm_write_tsc => kvm_arch_vcpu_postcreate => kvm_vm_ioctl_create_vcpu
I.e. at vCPU creation, if the master-clock enabling conditions hold with the master clock enabled, KVM_REQ_MASTERCLOCK_UPDATE is issued.
- When MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW is written on vcpu0 and the old-vs-new kvmclock choice changes, KVM_REQ_MASTERCLOCK_UPDATE is issued. This is the piece we saw during system-time initialization.
- In pvclock_gtod_update_fn, KVM_REQ_MASTERCLOCK_UPDATE is issued to all vCPUs. pvclock_gtod_update_fn is reached as follows:
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
 * Notification about pvclock gtod data update.
 */
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)	// the host clocksource switched from TSC to non-TSC
		queue_work(system_long_wq, &pvclock_gtod_work);	// put pvclock_gtod_work on a workqueue

	return 0;
}

static struct notifier_block pvclock_gtod_notifier = {
	.notifier_call = pvclock_gtod_notify,
};

int kvm_arch_init(void *opaque)
{
	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);	// register pvclock_gtod_notifier as a time-update listener: every
									// host time update ends up invoking pvclock_gtod_notify
}
I.e. when the host updates its time and kvm finds the host clocksource has switched from TSC to non-TSC, KVM_REQ_MASTERCLOCK_UPDATE is issued.
- In kvm_arch_hardware_enable, if the guest TSC is found to have gone backwards, KVM_REQ_MASTERCLOCK_UPDATE is issued to all vCPUs.
KVM_REQ_GLOBAL_CLOCK_UPDATE
- During kvmclock driver initialization, kvm_register_clock in kvmclock_init() triggers a wrmsr, which leads to kvm_set_msr_common writing MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW and issuing KVM_REQ_GLOBAL_CLOCK_UPDATE.
- On a vcpu-to-pcpu (physical CPU) migration, if the guest TSC is inconsistent, KVM_REQ_GLOBAL_CLOCK_UPDATE must be issued.
KVM_REQ_CLOCK_UPDATE
- kvm_gen_update_masterclock issues KVM_REQ_CLOCK_UPDATE to all vCPUs; kvm_gen_update_masterclock is the handler of KVM_REQ_MASTERCLOCK_UPDATE.
- kvmclock_update_fn issues KVM_REQ_CLOCK_UPDATE to all vCPUs; it is wired up as follows:

kvm_arch_init_vm()
{
	// delayed work: register kvmclock_update_fn as the callback of kvm->arch.kvmclock_update_work
	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
	// delayed work: register kvmclock_sync_fn as the callback of kvm->arch.kvmclock_sync_work
	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
}

static void kvmclock_sync_fn(struct work_struct *work)
{
	// run kvmclock_update_work->kvmclock_update_fn right away
	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
	// run kvmclock_sync_work->kvmclock_sync_fn again after 300 s
	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
			      KVMCLOCK_SYNC_PERIOD);
}

static void kvmclock_update_fn(struct work_struct *work)
{
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);	// handled by kvm_guest_time_update()
		kvm_vcpu_kick(vcpu);	// force a vcpu in guest mode back into host kernel mode (or wake it) so the request is processed
	}
}
So the kvmclock sync function schedules an immediate job (updating kvmclock) and a delayed job (syncing kvmclock again). After kvm first calls the sync function, kvmclock is updated and re-synced every 300 s, and every update issues KVM_REQ_CLOCK_UPDATE.
- kvm_gen_kvmclock_update issues KVM_REQ_CLOCK_UPDATE to the current vCPU, and 100 ms later runs the kvmclock update function kvmclock_update_fn, which issues KVM_REQ_CLOCK_UPDATE to all vCPUs. kvm_gen_kvmclock_update is the handler of KVM_REQ_GLOBAL_CLOCK_UPDATE.

static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
	struct kvm *kvm = v->kvm;

	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);	// issue KVM_REQ_CLOCK_UPDATE immediately
	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
			      KVMCLOCK_UPDATE_DELAY);	// run kvmclock_update_fn after 100 ms
}
- kvm_arch_vcpu_load issues KVM_REQ_CLOCK_UPDATE if an external tsc_offset_adjustment is detected, i.e. when switching to a particular vCPU a check decides whether the request is needed.
- kvm_set_guest_paused issues KVM_REQ_CLOCK_UPDATE; it tells the guest kernel that it has been stopped by kvm. I.e. KVM_REQ_CLOCK_UPDATE is issued when the guest kernel is paused.
- When qemu issues the KVM_SET_CLOCK ioctl, KVM_REQ_CLOCK_UPDATE is issued to all vCPUs; updating the guest clock when qemu sets the clock is only natural.
- __kvmclock_cpufreq_notifier issues KVM_REQ_CLOCK_UPDATE to all vCPUs. It is the callback invoked on cpu frequency changes: when the host cpu frequency changes, the guest's time must be recomputed.
- At vmexit, if the guest TSC keeps catching up with the host TSC, the guest TSC frequency is higher than the host's and the guest's time must be recalibrated, so KVM_REQ_CLOCK_UPDATE is issued to the current vCPU.
- kvm_arch_hardware_enable issues KVM_REQ_CLOCK_UPDATE to all vCPUs if the host TSC is unstable. Its call path is:
kvm_arch_hardware_enable => hardware_enable_nolock => kvm_starting_cpu
=> kvm_resume
That is, both when kvm starts a vCPU and when it resumes one, KVM_REQ_CLOCK_UPDATE must be issued to adjust the time.
Handling of the three requests
With the trigger points of the clock-update requests established, let's see what the handlers actually do with each request.
All three requests are processed in vcpu_enter_guest(), i.e. right before entering non-root mode.
KVM_REQ_MASTERCLOCK_UPDATE
static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	int i;
	struct kvm_vcpu *vcpu;
	struct kvm_arch *ka = &kvm->arch;

	spin_lock(&ka->pvclock_gtod_sync_lock);
	kvm_make_mclock_inprogress_request(kvm);	// issue KVM_REQ_MCLOCK_INPROGRESS so that no vcpu can enter the guest
	/* no guest entries from this point */
	pvclock_update_vm_gtod_copy(kvm);	// decide whether the guest can use the master clock (which keeps the vcpus' time in sync)

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);	// issue KVM_REQ_CLOCK_UPDATE to every vcpu

	/* guest entries allowed */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);	// clear KVM_REQ_MCLOCK_INPROGRESS so the vcpus may enter the guest again

	spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	int vclock_mode;
	bool host_tsc_clocksource, vcpus_matched;

	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			 atomic_read(&kvm->online_vcpus));	// whether the vcpus' tsc frequencies match

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);	// true if the host clocksource is tsc
								// master_kernel_ns: time since host boot recorded in the master copy
								// master_cycle_now: current host tsc value recorded in the master copy

	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock;	// backwards_tsc_observed: whether a backwards tsc was ever seen
									// boot_vcpu_runs_old_kvmclock: kvmclock uses the old MSRs

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1);	// if use_master_clock is 1, set kvm_guest_has_master_clock to 1 as well

	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
				      vcpus_matched);
#endif
}
So for KVM_REQ_MASTERCLOCK_UPDATE kvm does two things: it decides whether the guest can use the master clock (used to keep the vcpus' time in sync), and it issues the more basic KVM_REQ_CLOCK_UPDATE to all vCPUs (described under the handling of KVM_REQ_CLOCK_UPDATE).
KVM_REQ_GLOBAL_CLOCK_UPDATE
static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
	struct kvm *kvm = v->kvm;

	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);	// issue KVM_REQ_CLOCK_UPDATE immediately
	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
			      KVMCLOCK_UPDATE_DELAY);	// run kvmclock_update_fn after 100 ms
}
For KVM_REQ_GLOBAL_CLOCK_UPDATE, kvm first sends the more basic KVM_REQ_CLOCK_UPDATE to the current vCPU; 100 ms after the request, kvmclock_update_fn runs and sends KVM_REQ_CLOCK_UPDATE to all vCPUs.
In other words, KVM_REQ_GLOBAL_CLOCK_UPDATE is handled by:
- sending KVM_REQ_CLOCK_UPDATE to the current vCPU
- sending KVM_REQ_CLOCK_UPDATE to all vCPUs, kicking each of them
KVM_REQ_CLOCK_UPDATE
As the handling of the two requests above shows, both rest on the basic KVM_REQ_CLOCK_UPDATE, so its handling is the important one.
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, tgt_tsc_khz;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;

	kernel_ns = 0;
	host_tsc = 0;

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	spin_lock(&ka->pvclock_gtod_sync_lock);
	use_master_clock = ka->use_master_clock;
	if (use_master_clock) {	// if the host uses the tsc clock, the master copy can be handed straight to the guest
		host_tsc = ka->master_cycle_now;	// the master's cycle_now becomes host_tsc (a tsc count)
		kernel_ns = ka->master_kernel_ns;	// the master's kernel_ns becomes kernel_ns (ns since host boot)
	}
	spin_unlock(&ka->pvclock_gtod_sync_lock);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);	// read the current pcpu's tsc frequency in kHz
	if (unlikely(tgt_tsc_khz == 0)) {	// if it is invalid, re-issue KVM_REQ_CLOCK_UPDATE and bail out
		local_irq_restore(flags);
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}
	if (!use_master_clock) {	// if the host does not use the tsc clock,
		host_tsc = rdtsc();	// obtain the host tsc by reading it manually
		kernel_ns = ktime_get_boottime_ns();	// and take the ns elapsed since host kernel boot
	}

	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);	// apply scale and offset to the host tsc to obtain this update's timestamp

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 * 1) CPU could have been running below the maximum TSC rate
	 * 2) Broken TSC compensation resets the base at each VCPU
	 *    entry to avoid unknown leaps of TSC even when running
	 *    again on the same CPU.  This may cause apparent elapsed
	 *    time to disappear, and the guest to stand still or run
	 *    very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns);	// compute the theoretical guest tsc from the time since host boot
		if (tsc > tsc_timestamp) {	// if the theoretical guest tsc exceeds the adjusted host tsc
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);	// grow the offset by tsc - tsc_timestamp
			tsc_timestamp = tsc;	// and use the theoretical tsc as this update's timestamp, keeping them consistent
		}
	}

	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	if (kvm_has_tsc_control)	// if tsc scaling is supported,
		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);	// scale the target tsc frequency

	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {	// if the recorded hardware tsc frequency differs from the target, redo the scale factors
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

	// fill in hv_clock, the pvti-typed staging copy
	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	vcpu->last_guest_tsc = tsc_timestamp;

	/* If the host uses TSC clocksource, then it is stable */
	pvclock_flags = 0;
	if (use_master_clock)
		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;

	vcpu->hv_clock.flags = pvclock_flags;

	if (vcpu->pv_time_enabled)	// if paravirtualized time (kvmclock) is in use,
		kvm_setup_pvclock_page(v);	// copy hv_clock into the guest's pvti page
	if (v == kvm_get_vcpu(v->kvm, 0))
		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}
kvm_guest_time_update() does the following:
- obtains the host tsc value and the ns elapsed since host kernel boot
- reads the current pcpu's tsc frequency
- after a series of calibrations, stores the final time in vcpu->hv_clock (a sketch of the mul/shift derivation follows this list)
- if paravirtualized time is enabled for the vcpu, calls kvm_setup_pvclock_page
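kvm_get_time_scale derives tsc_to_system_mul and tsc_shift so that the guest-side pvclock scaling (shift the delta by tsc_shift, multiply by mul, divide by 2^32) turns TSC cycles into nanoseconds. The following is a simplified stand-in for that derivation plus a round-trip check; it is my own sketch, not the kernel routine:

#include <stdio.h>
#include <stdint.h>

/* Find shift and a 32-bit mul with mul * 2^shift ~= 1e9 * 2^32 / tsc_hz,
 * so that scaling a delta pvclock-style yields nanoseconds. */
static void time_scale_sketch(uint64_t tsc_hz, int8_t *pshift, uint32_t *pmul)
{
	int8_t shift = 0;
	/* ratio = 1e9 / tsc_hz in 32.32 fixed point */
	unsigned __int128 ratio = ((unsigned __int128)1000000000ULL << 32) / tsc_hz;

	while (ratio >= ((unsigned __int128)1 << 32)) {	/* too big for u32 */
		ratio >>= 1;
		shift++;	/* positive shift: pvclock shifts the delta left */
	}
	while (ratio < ((unsigned __int128)1 << 31)) {	/* normalize up for precision */
		ratio <<= 1;
		shift--;	/* negative shift: pvclock shifts the delta right */
	}
	*pshift = shift;
	*pmul = (uint32_t)ratio;
}

int main(void)
{
	int8_t shift;
	uint32_t mul;
	uint64_t tsc_hz = 2500000000ULL;	/* assume a 2.5 GHz TSC */

	time_scale_sketch(tsc_hz, &shift, &mul);

	/* round trip: one second worth of cycles should scale to ~1e9 ns */
	uint64_t delta = tsc_hz;
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	uint64_t ns = (uint64_t)(((unsigned __int128)delta * mul) >> 32);

	printf("shift=%d mul=%u -> %llu ns\n", shift, mul, (unsigned long long)ns);
	return 0;
}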
Now for kvm_setup_pvclock_page.
static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
	...
	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			       &vcpu->hv_clock,
			       sizeof(vcpu->hv_clock));	// copy the contents of hv_clock into pv_time
	...
}
pv_time here is the per-vCPU pvti structure we met earlier. Copying the contents of hv_clock into pv_time writes the latest time into the vCPU's pvti structure.
And that is how system time gets updated in the pvti structure.
How the host writes system time
Generally the host writes system time in two ways: synchronously and asynchronously.
Synchronous writes periodically refresh the system time value in the guest, keeping it aligned with the host's time.
Asynchronous writes refresh the guest's system time when special events occur (e.g. a guest suspend), preventing the guest's time from going wrong.
Synchronous writes of system time by the host
Via pvclock_gtod_register_notifier, kvm registers the callback pvclock_gtod_notify with the timekeeper layer (mentioned above among the request trigger points). Every time the host kernel clock is updated (i.e. whenever timekeeping_update is called), pvclock_gtod_notify runs.
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);	// refresh the pvclock copy of the timekeeper data

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)	// the clocksource switched from TSC to non-TSC
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}
static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;	// clocksource counter reading at the host time update
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns;	// ns since boot at the host time update
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;	// wallclock ns part at the host time update
	vdata->wall_time_sec		= tk->xtime_sec;	// wallclock seconds part at the host time update

	write_seqcount_end(&vdata->seq);
}
pvclock_gtod_notify() accomplishes two things:
- calls update_pvclock_gtod to refresh pvclock_gtod_data
- checks whether the host clocksource has become non-tsc, and if so queues the job pvclock_gtod_work
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);

static void pvclock_gtod_update_fn(struct work_struct *work)
{
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	atomic_set(&kvm_guest_has_master_clock, 0);
	mutex_unlock(&kvm_lock);
}
So pvclock_gtod_update_fn, the function behind pvclock_gtod_work, issues KVM_REQ_MASTERCLOCK_UPDATE to all vCPUs, and the latter, through the layers of calls described above, refreshes the time data in every vCPU's pvti structure.
That is, whenever the host kernel clock is updated, and the master clock is in use, kvm refreshes every vCPU's pvti time. The kernel code reads the current clocksource counter with tk_clock_read, and nothing in the surrounding context restricts which cpu may perform the read.
Asynchronous writes of system time by the host
Asynchronous writes of system time go through qemu, which interacts with kvm via kvm_vm_ioctl(KVM_SET_CLOCK).
Inside kvm, the KVM_SET_CLOCK ioctl is handled as follows:
case KVM_SET_CLOCK: {
	struct kvm_clock_data user_ns;
	u64 now_ns;

	r = -EFAULT;
	if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
		goto out;

	r = -EINVAL;
	if (user_ns.flags)
		goto out;

	r = 0;
	/*
	 * TODO: userspace has to take care of races with VCPU_RUN, so
	 * kvm_gen_update_masterclock() can be cut down to locked
	 * pvclock_update_vm_gtod_copy().
	 */
	kvm_gen_update_masterclock(kvm);	// re-evaluate whether the guest can use the master clock and issue time-update requests to all vcpus
	now_ns = get_kvmclock_ns(kvm);		// read the current kvmclock time
	kvm->arch.kvmclock_offset += user_ns.clock - now_ns;	// fold the difference from the qemu-supplied time into the offset
	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);	// refresh every vcpu's time using the new offset
	break;
}
So kvm_vm_ioctl(KVM_SET_CLOCK) does the following:
- re-evaluates whether the guest can use the master clock and issues time-update requests to all vCPUs
- reads the current kvmclock time (how it is read depends on whether the master clock is in use)
- computes the offset between the current time and the time qemu passed in
- refreshes every vCPU's time using the new offset
Asynchronous writes of system time rest on the qemu/kvm interaction kvm_vm_ioctl(KVM_SET_CLOCK).
masterclock: kvmclock depends on two quantities, host boot time and host TSC. Even with the host TSCs synchronized and the guest TSCs synchronized, sampling both quantities on pCPU0 and on pCPU1 can yield differences that disagree, in either direction, which could violate kvmclock's monotonicity. Keeping a single master copy, the master clock, solves this problem.
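A numeric illustration of the monotonicity hazard, with made-up values and 1 tsc cycle treated as 1 ns for simplicity:

#include <stdio.h>
#include <stdint.h>

/* Two pCPUs snapshot (kernel_ns, tsc) at slightly different instants.
 * A vcpu reading the clock right after migrating can see time go backwards. */
int main(void)
{
	uint64_t ns0 = 1000000, tsc0 = 5000000;	/* pCPU0 snapshot */
	uint64_t ns1 = 1000400, tsc1 = 5000100;	/* pCPU1 snapshot, taken a bit later
						 * and not perfectly consistent with pCPU0's */

	uint64_t now_tsc = 5001000;	/* same instant on both (TSCs synchronized) */

	uint64_t t_on_cpu0 = ns0 + (now_tsc - tsc0);	/* 1001000 */
	uint64_t t_on_cpu1 = ns1 + (now_tsc - tsc1);	/* 1001300 */

	/* a vcpu that reads 1001300 on pCPU1 and then, a few cycles later,
	 * ~1001000 on pCPU0 observes the clock jumping backwards */
	printf("cpu0=%llu cpu1=%llu\n",
	       (unsigned long long)t_on_cpu0, (unsigned long long)t_on_cpu1);
	return 0;
}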
---update on 6.1 2020-----
The relation between updates to pvclock_gtod_data and updates to each vCPU's pvti structure was not obvious to me, so this section records the investigation.
Allocating the host virtual addresses backing each vCPU's pvti
When the kvmclock driver initializes, kvm_register_clock in kvmclock_init() triggers a wrmsr, which reaches kvm_set_msr_common writing MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW.
The most critical statement in kvm_set_msr_common() is:
if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
     &vcpu->arch.pv_time, data & ~1ULL,
     sizeof(struct pvclock_vcpu_time_info)))	// compute the hva backing the gpa held in data;
						// store hva, gpa, region length, the memslot and its generation in arch.pv_time
	vcpu->arch.pv_time_enabled = false;
else
	vcpu->arch.pv_time_enabled = true;
The prototype of kvm_gfn_to_hva_cache_init is:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len);
Here ghc is a gfn_to_hva_cache, a cache for translating guest frame numbers into host virtual addresses, defined as:

struct gfn_to_hva_cache {
	u64 generation;			// generation of the cache
	gpa_t gpa;			// guest physical address
	unsigned long hva;		// host virtual address
	unsigned long len;		// size of the cache
	struct kvm_memory_slot *memslot;	// address of the kvm memslot this cache belongs to
};
kvm_gfn_to_hva_cache_init() is implemented as:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);	// slots points at the full set of kvm memslots

	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
}
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa);	// offset within the page
	/* gpa is just an address, but the region spans len bytes from it,
	 * so there is a first and a last gfn */
	gfn_t start_gfn = gpa >> PAGE_SHIFT;	// first gfn (guest frame number) of the gpa region
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;	// last gfn of the gpa region
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;	// number of pages needed
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL;	// r checks that the first/last gfn are sane

	/* record the guest physical address, memslot generation and required length in the gfn_to_hva_cache */
	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->len = len;
	ghc->hva = KVM_HVA_ERR_BAD;	// start with an invalid host virtual address

	//----------------------------- now find the hva for the gpa and store it in the gfn_to_hva_cache
	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) {	// if the requested region crosses two memslots, the whole region must be validated
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	return r;
}
The analysis above shows what the statement below does: it reserves, against kvm_memslots, a cache region the size of a pvclock_vcpu_time_info; the region caches the contents at the physical address held in data, and hva is the host virtual address backing that cache.

kvm_gfn_to_hva_cache_init(vcpu->kvm,
     &vcpu->arch.pv_time, data & ~1ULL,
     sizeof(struct pvclock_vcpu_time_info))

In the kvm_set_msr_common() triggered when the kvmclock driver writes MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW at init, data is each vCPU's pvti structure: the pvti thereby gains a host virtual address.
Writing time into the host virtual address backing each vCPU's pvti
Wherever the time comes from, it will certainly be written into the pvti structure.
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	...
	if (vcpu->pv_time_enabled)	// enabled back when kvm_set_msr_common established the hva for the pvti's gpa
		kvm_setup_pvclock_page(v);
}

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
	...
	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			       &vcpu->hv_clock,
			       sizeof(vcpu->hv_clock));	// write hv_clock (a pvti) to the hva recorded in the vcpu's gfn_to_hva_cache,
							// i.e. into the vcpu's pvti
	...
}
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
}

int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				  void *data, unsigned int offset,
				  unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;
	gpa_t gpa = ghc->gpa + offset;

	BUG_ON(len + offset > ghc->len);

	if (slots->generation != ghc->generation)
		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);

	if (unlikely(!ghc->memslot))
		return kvm_write_guest(kvm, gpa, data, len);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);	// write the data to the hva held in the gfn_to_hva_cache
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);

	return 0;
}
In summary, the statement below means:
write the vCPU's hv_clock (pvti-typed) data into the host virtual address backing that vCPU's pvti.
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
		       &vcpu->hv_clock,
		       sizeof(vcpu->hv_clock));
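A step back: the generation check inside kvm_write_guest_offset_cached is a common invalidation pattern; the cached translation is valid only while the memslots layout that produced it is still current. A generic sketch of the pattern (hypothetical names, not KVM code):

#include <stdio.h>
#include <stdint.h>

struct table {
	uint64_t generation;	/* bumped on every layout change */
	int data;
};

struct cached_ref {
	uint64_t generation;	/* generation the cached value belongs to */
	int value;
	int valid;
};

static int slow_lookup(const struct table *t)	/* stand-in for __kvm_gfn_to_hva_cache_init */
{
	return t->data * 2;
}

static int cached_lookup(const struct table *t, struct cached_ref *c)
{
	if (!c->valid || c->generation != t->generation) {	/* stale: recompute */
		c->value = slow_lookup(t);
		c->generation = t->generation;
		c->valid = 1;
	}
	return c->value;
}

int main(void)
{
	struct table t = { .generation = 1, .data = 21 };
	struct cached_ref c = { 0 };

	printf("%d\n", cached_lookup(&t, &c));	/* computes: 42 */
	t.generation++;				/* a layout change invalidates the cache */
	t.data = 50;
	printf("%d\n", cached_lookup(&t, &c));	/* recomputes: 100 */
	return 0;
}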
pvclock_gtod_data
pvclock_gtod_data is a global variable; kvm refreshes its contents on every host tick.
struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int	vclock_mode;
		u64	cycle_last;
		u64	mask;
		u32	mult;
		u32	shift;
	} clock;

	u64		boot_ns;
	u64		nsec_base;
	u64		wall_time_sec;
};
How is it refreshed?
Via pvclock_gtod_register_notifier, kvm registers the callback pvclock_gtod_notify with the timekeeper layer. Every time the host kernel clock is updated (i.e. timekeeping_update is called), pvclock_gtod_notify runs and in turn calls update_pvclock_gtod to refresh the values in pvclock_gtod_data.
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);	// refresh the contents of pvclock_gtod_data

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)	// the clocksource switched from TSC to non-TSC
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}
static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;	// type of clocksource
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;	// clocksource counter reading at the host time update
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns;	// base time, in ns, from which the current time is derived at the host time update
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;	// wallclock ns part at the host time update
	vdata->wall_time_sec		= tk->xtime_sec;	// wallclock seconds part at the host time update

	write_seqcount_end(&vdata->seq);
}
Where the pvti data comes from: vcpu->hv_clock
What we know so far:
- when kvm refreshes each vcpu's pvti in kvm_guest_time_update(), it first stores the current time in the local hv_clock and then writes hv_clock's contents into the pvti structure
- on every host tick, i.e. every time the host updates its time, kvm copies the time-related variables into the global pvclock_gtod_data
So hv_clock must be related to pvclock_gtod_data in some way; let's find the connection.
First, kvm_guest_time_update() checks whether the master clock is used, i.e. the value of use_master_clock, and the current time is obtained differently depending on that boolean.
When kvm decides the value of use_master_clock is out of scope here; it suffices to know that when it is 1, kvm keeps just one copy of host tsc and guest tsc, which the other vcpus reuse.
When use_master_clock is true
If use_master_clock is true, then:

host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;

So what do ka->master_cycle_now and ka->master_kernel_ns mean, and when are they assigned?
pvclock_update_vm_gtod_copy() assigns master_cycle_now and master_kernel_ns, in the form of a call to kvm_get_time_and_clockread(). kvm_get_time_and_clockread obtains master_kernel_ns, the ns elapsed since host boot, via do_monotonic_boot() and the values held in pvclock_gtod_data.
do_monotonic_boot obtains master_cycle_now via vgettsc => read_tsc. read_tsc compares the result of the rdtsc instruction with pvclock_gtod_data.clock.cycle_last to determine whether the tsc value went backwards: if it did, read_tsc returns cycle_last, i.e. the value at the previous tsc read; otherwise it returns the rdtsc result. In short, master_cycle_now is a non-receding tsc value on the current pCPU.
// the function in which kvm refreshes master_kernel_ns and master_cycle_now
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
	...
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);
}
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
	...
	return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
						      tsc_timestamp));	// do_monotonic_boot yields master_kernel_ns and master_cycle_now
}
static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->nsec_base;
		ns += vgettsc(tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
		ns += gtod->boot_ns;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
	*t = ns;

	return mode;
}
// our hosts essentially always use tsc, so the VCLOCK_TSC case is the one taken
static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
{
	...
	case VCLOCK_TSC:
		*mode = VCLOCK_TSC;
		*tsc_timestamp = read_tsc();
		v = (*tsc_timestamp - gtod->clock.cycle_last) &
			gtod->clock.mask;
		break;
}
static u64 read_tsc(void)
{
	u64 ret = (u64)rdtsc_ordered();
	u64 last = pvclock_gtod_data.clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/*
	 * GCC likes to generate cmov here, but this branch is extremely
	 * predictable (it's just a function of time and the likely is
	 * very likely) and there's a data dependence, so force GCC
	 * to generate a branch instead. I don't barrier() because
	 * we don't actually need a barrier, and if this function
	 * ever gets inlined it will generate worse code.
	 */
	asm volatile ("");
	return last;
}
pvclock_update_vm_gtod_copy() is referenced in only two places in the kvm code: kvm_arch_init_vm() and kvm_gen_update_masterclock(). The latter is in turn referenced in two places: kvm_arch_vm_ioctl(KVM_SET_CLOCK), and the KVM_REQ_MASTERCLOCK_UPDATE check performed on every vcpu_enter_guest().
In other words, ka->master_kernel_ns and ka->master_cycle_now are refreshed at three points while kvm runs:
- when the virtual machine (guest) is initialized
- when a KVM_REQ_MASTERCLOCK_UPDATE request is found on entry to non-root mode
- when userspace (e.g. qemu) actively requests a time update
Conclusion: with the master clock, host_tsc is a non-receding TSC value on the current pCPU, and kernel_ns is the ns elapsed since host boot. Also, only when use_master_clock is true do the contents of the pvclock_gtod_data maintained by kvm come into play.
When use_master_clock is false
If use_master_clock is false, then:

host_tsc = rdtsc();
kernel_ns = ktime_get_boottime_ns();

rdtsc() reads the current pCPU's TSC value directly.
ktime_get_boottime_ns() returns the ns elapsed since host boot (using the host kernel's timekeeping structures).
Conclusion: without the master clock, host_tsc is the current pCPU's TSC value (with no guarantee against receding) and kernel_ns is the ns elapsed since host boot.
Adjusting the TSC calibration factors and the final pvti-cache assignment
With host_tsc and kernel_ns in hand, kvm_read_l1_tsc applies the arch-level tsc_offset and tsc_scale to host_tsc, and the result is assigned to tsc_timestamp, whose meaning is then obvious: the TSC timestamp of this clock update.

tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);

Next, depending on the value of vcpu->tsc_catchup, the arch-level tsc_offset and tsc_scale may need adjusting. If so, compute from kernel_ns what the guest tsc theoretically should read at this moment; if the theoretical value is larger than the tsc actually read, the guest's tsc_offset and tsc_scale can no longer adjust host_tsc correctly and must be fixed up: the guest's tsc_offset is corrected by the difference between the theoretical and the current tsc, and the theoretical value becomes this update's TSC timestamp.
if (vcpu->tsc_catchup) {
	u64 tsc = compute_guest_tsc(v, kernel_ns);
	if (tsc > tsc_timestamp) {
		adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
		tsc_timestamp = tsc;
	}
}
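Numerically, with made-up values, the catch-up adjustment looks like this (a sketch of the arithmetic only, not the kernel functions):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t  tsc_offset    = -500;		/* current guest tsc offset (made up) */
	uint64_t host_scaled   = 1000700;	/* host tsc after scaling (made up) */
	uint64_t tsc_timestamp = host_scaled + tsc_offset;	/* 1000200: what the guest would see */
	uint64_t theoretical   = 1000500;	/* compute_guest_tsc(): tsc the guest should have by now */

	if (theoretical > tsc_timestamp) {	/* the guest tsc fell behind wall-clock time */
		tsc_offset += theoretical - tsc_timestamp;	/* what adjust_tsc_offset_guest() amounts to */
		tsc_timestamp = theoretical;
	}
	printf("new offset=%lld timestamp=%llu\n",
	       (long long)tsc_offset, (unsigned long long)tsc_timestamp);
	return 0;
}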
Then:
- if the TSC scaling feature is supported, use it to adjust this vcpu's target TSC frequency.

if (kvm_has_tsc_control)
	tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
- if the local vcpu's tsc frequency differs from the target tsc frequency, recompute hv_clock's shift and mul factors so that the local vcpu's tsc frequency matches the target.

if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
	kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
			   &vcpu->hv_clock.tsc_shift,
			   &vcpu->hv_clock.tsc_to_system_mul);
	vcpu->hw_tsc_khz = tgt_tsc_khz;
}
After that, this update's TSC timestamp and the correct system time (in actual use, the wallclock must be added to it to yield standard calendar time) are stored into the hv_clock structure, and the timestamp is also kept in the vcpu's last_guest_tsc variable.
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
Then kvm_setup_pvclock_page() pushes the values in hv_clock into the pv_time cache, the cache that was given space and a host virtual address along the path kvmclock_init => WRITE MSR => handle_wrmsr => kvm_set_msr_common => kvm_gfn_to_hva_cache_init. Every vcpu has its own pv_time cache, whose gpa points at that vcpu's pvti structure.

if (vcpu->pv_time_enabled)
	kvm_setup_pvclock_page(v);	// copy the values in hv_clock into the pv_time cache
With that, the relation between pvclock_gtod_data and the updates of each vcpu's pvti structure is essentially sorted out.
The basic flow is:
- set up a cache for the pvti structure
- refresh the cache contents every time kvm updates the time