kvmclock代碼學習


Linux源碼版本: 5.3.0

guest os中的kvmclock驅動

kvmclock_init()函數主要做了以下幾件事:

  1. 確定了各vcpu要使用的MSR

  2. 將各vcpu在kvmclock中實際使用的數據結構pvclock_vsyscall_time_info的物理地址利用write_msr寫到屬於每個vcpu的MSR

  3. 將1GHz的kvmclock作為clocksource注冊到系統clocksource中

/*
 * Guest-side kvmclock driver initialisation (Linux 5.3):
 *  1. select the MSR pair (new vs. legacy) used to talk to the host,
 *  2. publish vcpu0's pvclock_vsyscall_time_info physical address to the
 *     host via wrmsr, and register hooks for the secondary cpus,
 *  3. register kvm_clock (1 GHz) as a clocksource.
 */
void __init kvmclock_init(void)
{
	u8 flags;

	if (!kvm_para_available() || !kvmclock) /* no kvmclock paravirt support (or disabled): nothing to do */
		return;

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { /* host advertises the new kvmclock MSR pair */
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { /* neither new nor legacy MSRs supported: bail out
								      * (if only the legacy feature is present we fall
								      * through and keep the legacy MSR numbers) */
		return;
	}

	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
			      kvmclock_setup_percpu, NULL) < 0) { /* register the per-cpu prepare callback with the
								   * cpu-hotplug core (runs before a cpu comes up) */
		return;
	}

	pr_info("kvm-clock: Using msrs %x and %x",
		msr_kvm_system_time, msr_kvm_wall_clock); /* msr_kvm_system_time: MSR used for system time
							   * msr_kvm_wall_clock:  MSR used for the wallclock */

	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); /* point the boot cpu's per-cpu slot at the first
							      * element (virtual address) of hv_clock_boot */
	/* write the physical address of vcpu0's pvti into msr_kvm_system_time */
	kvm_register_clock("primary cpu clock");
	pvclock_set_pvti_cpu0_va(hv_clock_boot); /* record hv_clock_boot as cpu0's pvti (pvti_cpu0_va) */

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) /* host promises a stable clocksource */
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);

	flags = pvclock_read_flags(&hv_clock_boot[0].pvti); /* read the flags of vcpu0's pvti */
	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); /* build sched_clock() (scheduler timestamps /
							       * delays) on top of kvmclock */

	/* install the platform callbacks */
	x86_platform.calibrate_tsc = kvm_get_tsc_khz; /* report the host-provided TSC frequency */
	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
	x86_platform.get_wallclock = kvm_get_wallclock; /* wallclock: absolute time of day (since 1970) */
	x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; /* for each secondary vcpu, write its own
									  * pvti physical address into its
									  * msr_kvm_system_time */
#endif
	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
	machine_ops.shutdown  = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown  = kvm_crash_shutdown;
#endif
	kvm_get_preset_lpj(); /* lpj: loops_per_jiffy */

	/*
	 * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
	 * with P/T states and does not stop in deep C-states.
	 *
	 * Invariant TSC exposed by host means kvmclock is not necessary:
	 * can use TSC as clocksource.
	 *
	 */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
	    !check_tsc_unstable())
		kvm_clock.rating = 299;

	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); /* register kvmclock as a 1 GHz clocksource */
	pv_info.name = "KVM";
}

注冊為x86_platform.xxxx的函數有3個,分別為kvm_get_tsc_khz,kvm_get/set_wallclock。

kvm_get_wallclock()

/*
 * The wallclock is the time of day when we booted. Since then, some time may
 * have elapsed since the hypervisor wrote the data. So we try to account for
 * that with system time
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); /* hand the physical address of wall_clock to the
								     * host; the wrmsr vmexits and KVM fills it in */
	preempt_disable();
	/* combine the host-written boot wallclock with elapsed system time into *now */
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
	preempt_enable();
}

/*
 * Read the boot-time wallclock previously written by the host, add the time
 * elapsed since guest boot (from this vcpu's pvti), and return the current
 * time of day in *ts.
 */
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec64 *ts)
{
	u32 version;
	u64 delta;
	struct timespec64 now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		/*
		 * Note: wall_clock->sec is a u32 value, so it can
		 * only store dates between 1970 and 2106. To allow
		 * times beyond that, we need to create a new hypercall
		 * interface with an extended pvclock_wall_clock structure
		 * like ARM has.
		 */
		now.tv_sec  = wall_clock->sec; /* boot-time wallclock only; the elapsed */
		now.tv_nsec = wall_clock->nsec; /* system time (delta) is added below */
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version)); /* retry while an update is in
										  * flight (odd version) or the
										  * version changed under us */

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC); /* split delta into seconds + nanoseconds */
	now.tv_sec = delta;

	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}



/*
 * Boot-time wallclock shared with the host. 'version' is used as a seq
 * counter by readers (odd while an update is in progress — see the retry
 * loop in pvclock_read_wallclock()).
 */
struct pvclock_wall_clock {
	u32   version;
	u32   sec;	/* seconds since the epoch at guest boot; u32, so it wraps in 2106 */
	u32   nsec;
} __attribute__((__packed__));

kvm_get_tsc_khz()

/* Return the guest TSC frequency in kHz, derived from this cpu's pvti. */
static unsigned long kvm_get_tsc_khz(void)
{
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* mark the TSC frequency as reliably known */
	return pvclock_tsc_khz(this_cpu_pvti());
}

/*
 * Recover the TSC frequency (in kHz) from the scaling parameters the host
 * publishes in the pvti: invert tsc_to_system_mul and undo tsc_shift.
 */
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
	u64 khz = 1000000ULL << 32;

	do_div(khz, src->tsc_to_system_mul);
	if (src->tsc_shift >= 0)
		khz >>= src->tsc_shift;
	else
		khz <<= -src->tsc_shift;
	return khz;
}

guest中設置時間的代碼框架

內核獲取wallclock

static struct pvclock_wall_clock wall_clock __bss_decrypted; /* file-scope wallclock buffer shared with the host
							      * (lives in .bss; __bss_decrypted presumably keeps it
							      * host-accessible under memory encryption — confirm) */

static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); /* writing the MSR traps to KVM (wrmsr vmexit),
								     * which fills wall_clock in */
	preempt_disable();
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); /* boot wallclock + elapsed system time -> *now */
	preempt_enable();
}

kvm_get_wallclock函數的第一個語句就會觸發wrmsr_vmexit, 進而經過一系列的調用:

handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
    u64 data = msr_info->data;
    ...
        case MSR_KVM_WALL_CLOCK_NEW:
		case MSR_KVM_WALL_CLOCK:
			vcpu->kvm->arch.wall_clock = data;
			kvm_write_wall_clock(vcpu->kvm, data);
			break;
    ...
}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
    ...
        // 這里的boot time為host系統啟動時間
        getboottime64(&boot); // 獲取host的boot time並寫入boot變量
    	if (kvm->arch.kvmclock_offset) {//對host boot time做一些調整
			struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
			boot = timespec64_sub(boot, ts);
	} 
		wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
		wc.nsec = boot.tv_nsec;
		wc.version = version;
		 // 更新guest的wallclock
		kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); // 將wc即host boot time的內容寫入guest的wallclock中
    ...
}

kvm_write_wall_clock()函數執行完畢之后,guest可以在全局變量wall_clock中找到host系統的boot time.

內核更新wallclock

/* Return the system boot time (CLOCK_REALTIME at boot) in *ts. */
void getboottime64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	/* boot time = wall-clock offset minus the accumulated boot offset */
	*ts = ktime_to_timespec64(ktime_sub(tk->offs_real, tk->offs_boot));
}

重要語句為struct timekeeper *tk = &tk_core.timekeeper;而tk_core的定義為:

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
static struct {
	seqcount_t		seq;	/* seqcount: readers retry if a write overlaps their read */
	struct timekeeper	timekeeper;
} tk_core ____cacheline_aligned = {
	.seq = SEQCNT_ZERO(tk_core.seq),
};

所以tk_core是一個結構體變量, 存儲於靜態區,其seq字段是一個seqcount序列鎖:寫者更新期間version為奇數,讀者據此重試,從而保證讀到一致的時間數據.

getboottime64()讀取tk_core.timekeeper的offs_real和offs_boot內容,那么tk_core.timekeeper的內容在哪里設置的呢?在內核代碼中找到了更新tk->offs_boot內容的代碼:

/* Fold 'delta' (the time spent suspended) into the timekeeper's boot-time
 * offset, and refresh the timespec64 mirror used by the VDSO update path. */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	tk->offs_boot = ktime_add(tk->offs_boot, delta); /* tk->offs_boot += delta */
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); /* timespec64 mirror of offs_boot */
}

tk_update_sleep_time()的核心語句為tk->offs_boot = ktime_add(tk->offs_boot, delta);,delta是什么,從哪里來,需要在內核代碼中查找tk_update_sleep_time()的調用位置和參數意義.

static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
					   const struct timespec64 *delta)
{
    ...
    // 設置tk中的CLOCK_REALTIME時間,並記錄誤差
    tk_xtime_add(tk, delta); // 該函數內容為: tk->xtime_sec += delta->tv_sec;
    												   //                              tk->tkr_mono.xtime_nsec += delta->tv_nsec << tk->tkr_mono.shift;
    												  // xtime_sec:  以秒為單位的當前CLOCK_REALTIME時間
    												 // tkr: timekeeping read, 用於讀出時間的結構體.
    												// tkr_mono.xtime_nsec 讀出時間肯定存在誤差, xtime_nsec是讀出時間的ns級誤差
    ...	
    tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
    ...
}

在__timekeeping_inject_sleeptime()中還是看不到delta的取值,繼續找.

/* Resume timekeeping after suspend: determine how long we slept and inject
 * it into the timekeeper. (Excerpt — the kernel function continues past the
 * end shown here.) */
void timekeeping_resume(void)
{
    struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock = tk->tkr_mono.clock;
	unsigned long flags;
	struct timespec64 ts_new, ts_delta;
	u64 cycle_now, nsec;
	bool inject_sleeptime = false;

	read_persistent_clock64(&ts_new); /* current persistent (wall) clock reading */

	clockevents_resume();
	clocksource_resume();

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);

	/*
	 * After system resumes, we need to calculate the suspended time and
	 * compensate it for the OS time. There are 3 sources that could be
	 * used: Nonstop clocksource during suspend, persistent clock and rtc
	 * device.
	 *
	 * One specific platform may have 1 or 2 or all of them, and the
	 * preference will be:
	 *	suspend-nonstop clocksource -> persistent clock -> rtc
	 * The less preferred source will only be tried if there is no better
	 * usable source. The rtc part is handled separately in rtc core code.
	 */
	cycle_now = tk_clock_read(&tk->tkr_mono); /* current clocksource counter */
	nsec = clocksource_stop_suspend_timing(clock, cycle_now); /* suspend duration per nonstop clocksource */
	if (nsec > 0) {
		ts_delta = ns_to_timespec64(nsec);
		inject_sleeptime = true;
	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); /* fall back to the persistent clock */
		inject_sleeptime = true;
	}

	if (inject_sleeptime) {
		suspend_timing_needed = false;
		__timekeeping_inject_sleeptime(tk, &ts_delta); /* so 'delta' is the time spent suspended */
	}
}

我們一路追溯的delta原來是suspend的時間,畫出回溯圖:

內核初始化wallclock

現在我們知道,內核如何獲取wallclock,是靠x86_platform.get_wallclock,我們也知道,內核如何更新wallclock,是在系統suspend之后,resume之前,利用__timekeeping_inject_sleeptime()修改tk_core的內容,修改了wallclock.

但是,系統初始化時,wallclock肯定就被設置了,那么wallclock是如何被初始化的呢?猜測在timekeeping_init相關的函數中.

void __init timekeeping_init(void)
{
    ...
        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); // 讀取walltime和bootOffset到這倆變量
    ...
}
/* Default (weak) implementation: wall_time from the persistent clock,
 * boot_offset from the local clock (ns since boot). */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
				     struct timespec64 *boot_offset)
{
	read_persistent_clock64(wall_time); /* platform wallclock */
	*boot_offset = ns_to_timespec64(local_clock());
}
/* not static: needed by APM */
void read_persistent_clock64(struct timespec64 *ts)
{
	x86_platform.get_wallclock(ts); /* kvm_get_wallclock when running as a KVM guest */
}

是不是很熟悉,x86_platform.get_wallclock(ts);在kvm guest中,x86_platform.get_wallclock = kvm_get_wallclock,而在某些非KVM平台(如Intel MID)上,x86_platform.get_wallclock = vrtc_get_time(其他x86平台一般直接讀CMOS RTC).

/* Read the time of day (second resolution) from the virtual RTC CMOS
 * registers and return it as a timespec64. */
void vrtc_get_time(struct timespec64 *now)
{
    u8 sec, min, hour, mday, mon;
	unsigned long flags;
	u32 year;

	spin_lock_irqsave(&rtc_lock, flags);

	/* spin until the RTC "update in progress" bit clears */
	while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
		cpu_relax();

	sec = vrtc_cmos_read(RTC_SECONDS);
	min = vrtc_cmos_read(RTC_MINUTES);
	hour = vrtc_cmos_read(RTC_HOURS);
	mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
	mon = vrtc_cmos_read(RTC_MONTH);
	year = vrtc_cmos_read(RTC_YEAR);

	spin_unlock_irqrestore(&rtc_lock, flags);

	/* vRTC YEAR reg contains the offset to 1972 */
	year += 1972;

	pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
		"mon: %d year: %d\n", sec, min, hour, mday, mon, year);

	now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
	now->tv_nsec = 0; /* the RTC only has second resolution */
}

可以看到,walltime的值包含了年月日時分秒,讀取自rtc_cmos時鍾中.

那么,結論就來了,guest的kvmclock的wallclock來自於RTC時鍾, 且該wallclock由所有vcpu共享,如果vcpu想獲得wallclock,就得寫屬於自己的msr_wall_clock.每當wallclock的內容更新,所有vcpu都能讀到最新wallclock,而不是只有寫msr_wall_clock的那個vcpu可以讀到.

systemTime的初始化

從kvmclock驅動角度來看,在kvmclock_init()中,就將vcpu0和其余vcpu的pvti結構的物理地址,通過write msr寫到了各自的system_time_msr中.

static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); /* per-cpu pointer to this cpu's pvti */
#define HVC_BOOT_ARRAY_SIZE \
	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) /* number of pvti entries that fit in one page */
static struct pvclock_vsyscall_time_info
			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); /* boot-time pvti array */

void __init kvmclock_init(void)
{
    ...
        // 獲得msr_system_time和msr_wall_clock
    	msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
    
    ...
        this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); // 將pvti結構數組中的第一個元素地址給hv_clock_per_cpu
    	kvm_register_clock("primary cpu clock");  // 將hv_clock_per_cpu的物理地址寫入對應system_time_msr
  	    pvclock_set_pvti_cpu0_va(hv_clock_boot); // pvti_cpu0_va = hv_clock_boot, 將hv_clock_boot地址作為cpu0的pvti地址
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; // 在smp_init中,調用kvm_register_clock初始化除cpu0以外的cpu時鍾, 也因此將各自的hv_clock_per_cpu的物理地址傳入了對應的system_time_msr
#endif
    
    ...
        
        clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);// 注冊1GHz的kvmclock作為一個clocksource
    	
}

/* early_percpu_clock_init hook: register the current (secondary) cpu's pvti. */
static void kvm_setup_secondary_clock(void)
{
	kvm_register_clock("secondary cpu clock");
}

/* Publish this cpu's pvti to the host: write its physical address, with the
 * enable bit set, into the per-cpu system-time MSR. */
static void kvm_register_clock(char *txt)
{
	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
	u64 pa;

	if (!src)
		return;

	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; /* bit 0 = enable */
	wrmsrl(msr_kvm_system_time, pa); /* traps to KVM, which records the pvti location for this vcpu */

	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}

現在我們知道了,在kvmclock_init()中,會用寫msr的方式,將各cpu的pvti結構的物理地址寫入各自對應的system_time_msr, 這里要追溯兩條線索:

一條線索向上,找出在何時調用kvmclock_init()進而將cpu0的pvti結構的物理地址寫入對應msr,以及何時調用x86_cpuinit.early_percpu_clock_init,將其余cpu的pvti結構的物理地址寫入對應msr.

另一條線索向下,當guest kernel中發生寫msr時,會導致wrmsr_vmexit,研究在該vmexit中,會怎樣處理對應msr.

可以看到,在guest啟動內核時就調用了kvmclock_init(),將vcpu0的pvti結構的物理地址寫入了對應msr, 並注冊了將其余vcpu的pvti結構的物理地址寫入對應msr的回調函數kvm_setup_secondary_clock.

接下來看何時調用x86_cpuinit.early_percpu_clock_init.

至此,所有vcpu的pvti的物理地址寫入msr路徑已經搞清楚,接下來看另一條線索,即當寫msr動作發生時,觸發vmexit,在handle_wrmsr中如何處理system_time. 與wallclock類似,也經歷了以下調用過程.

handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	...
        case MSR_KVM_SYSTEM_TIME_NEW:
		case MSR_KVM_SYSTEM_TIME: {
			struct kvm_arch *ka = &vcpu->kvm->arch;

			kvmclock_reset(vcpu); // 將該vcpu的pv_time_enabled標志置為false

            // 如果是vcpu0, 那么就將tmp設置為表示是否使用的舊的kvmclock
			if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { // host_initiated在handle_wrmsr()中會被置false
			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
			
            // 如果沒有使用舊的kvmclock,則發出KVM_REQ_MASTERCLOCK_UPDATE請求
			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

			ka->boot_vcpu_runs_old_kvmclock = tmp;
		}
			// 將該vcpu的pvti的物理地址值賦值給該vpcu的arch.time,並發出KVM_REQ_GLOBAL_CLOCK_UPDATE請求
			vcpu->arch.time = data;
			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

			/* we verify if the enable bit is set... */
			if (!(data & 1)) // 確保pvti的bit0不為0,如果為0,將不使用kvmclock
				break;

			if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
		  	   &vcpu->arch.pv_time, data & ~1ULL,
		  	   sizeof(struct pvclock_vcpu_time_info)))
				vcpu->arch.pv_time_enabled = false;
			else
				vcpu->arch.pv_time_enabled = true;

			break;
	}
}

即,如果運行的是vcpu0,且是否使用舊的kvmclock msr與當前的boot_vcpu_runs_old_kvmclock標志不一致,那么一定是出了一些什么問題,需要校准MASTERCLOCK,發出KVM_REQ_MASTERCLOCK_UPDATE請求.然后進行普通vcpu的操作.

普通vcpu的操作: 如果運行的是其它vcpu,那么只需要將該vcpu的pvti的物理地址值賦值給該vcpu的arch.time,並發出KVM_REQ_GLOBAL_CLOCK_UPDATE請求(也就是說,vcpu0有可能連續發出兩個REQUEST).之后根據kvm_gfn_to_hva_cache_init的結果將pv_time_enabled置為true或false.看一下kvm_gfn_to_hva_cache_init函數.

/* Initialise a gfn->hva cache for the guest physical range at 'gpa' (here:
 * the vcpu's pvti) so later host writes avoid repeated address translation. */
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm); /* current memslot view of guest memory */
	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); /* fill the cache and validate the range */
}

/*
 * Fill in a gfn->hva cache for [gpa, gpa+len): record gpa/generation/len,
 * resolve the backing host virtual address, and validate every page of the
 * range even when it crosses memslots. Returns 0 on success.
 */
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa); /* offset of gpa within its page */
	gfn_t start_gfn = gpa >> PAGE_SHIFT; /* first guest frame number of the range */
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; /* last guest frame number of the range */
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1; /* number of pages spanned */
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL; /* 0 unless the range is degenerate */

	ghc->gpa = gpa;
	ghc->generation = slots->generation; /* memslot generation, used to detect a stale cache */
	ghc->len = len;
	ghc->hva = KVM_HVA_ERR_BAD;

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) { /* walk and validate every page of the range */
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail); /* host virtual address backing this gfn */
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	return r; /* 0 in the normal case */
}

可以看出,kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)) 為vcpu->arch.pv_time申請了cache空間(對應host的虛擬地址),將pvti的物理地址寫入了該cache的gpa字段.

所以,在分配host虛擬地址成功的情況下,vcpu->arch.pv_time_enabled肯定為true.

綜上,在各vcpu啟動后,將各vcpu的pvti結構的物理地址寫入msr_system_time_i,並開辟緩存空間,用於打通host和guest.

各vcpu的pvti結構只需要一次wrmsr便可與host虛擬地址關聯,之后無需wrmsr,host不定期寫入pvti的最新值.

guest從pvti結構讀取system time

guest從pvti結構讀取system time的觸發點為上面提到的3種request:

KVM_REQ_MASTERCLOCK_UPDATE

KVM_REQ_GLOBAL_CLOCK_UPDATE

KVM_REQ_CLOCK_UPDATE

那么, guest kernel中什么時候發出這3種REQUEST呢?逐個來看.

三大更新時間請求的觸發點

KVM_REQ_MASTERCLOCK_UPDATE

  1. 當masterclock被使能,就一直發出KVM_REQ_MASTERCLOCK_UPDATE請求,以更新masterclock. 這樣情況的代碼在kvm_track_tsc_matching中.

masterclock何時可以被使能:

  • host clocksource必須為tsc
  • vcpus必須有matched tsc,即vcpus的v_tsc必須與host_tsc頻率一致

調用路徑一共有2條:

第一條為:(由底層函數向頂層函數追溯)

kvm_track_tsc_matching => kvm_write_tsc => kvm_set_msr_common寫MSR_IA32_TSC

即在guest os運行過程中,如果出現kvm_set_msr_common(MSR_IA32_TSC), 且滿足masterclock使能條件,且masterclock使能,則發出KVM_REQ_MASTERCLOCK_UPDATE請求

第二條為:(由底層函數向頂層函數追溯)

kvm_track_tsc_matching => kvm_write_tsc => kvm_arch_vcpu_postcreate => kvm_vm_ioctl_create_vcpu

即在創建vcpu時,滿足masterclock使能條件,且masterclock使能,則發出KVM_REQ_MASTERCLOCK_UPDATE請求.

  2. 寫MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW時,如果使用的是新版kvmclock,即寫的是MSR_KVM_SYSTEM_TIME_NEW, 則發出KVM_REQ_MASTERCLOCK_UPDATE.這是systemTime的初始化期間的一段.

  3. 在pvclock_gtod_update_fn中,對所有vcpu發出了KVM_REQ_MASTERCLOCK_UPDATE.而pvclock_gtod_update_fn的調用路徑為:

static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);

/*
 * Notification about pvclock gtod data update: called by the timekeeper on
 * every host clock update; refreshes KVM's copy of the timekeeper data.
 */
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0) /* host clocksource just stopped being TSC-based */
		queue_work(system_long_wq, &pvclock_gtod_work); /* defer pvclock_gtod_update_fn to a workqueue */

	return 0;
}


/* Registered with the timekeeper (see kvm_arch_init) so KVM is notified on
 * every host clock update. */
static struct notifier_block pvclock_gtod_notifier = {
	.notifier_call = pvclock_gtod_notify,
};

int kvm_arch_init(void *opaque)
{
    pvclock_gtod_register_notifier(&pvclock_gtod_notifier); // 將pvclock_gtod_notifier注冊為一個時間更新listener,每當host更新時間, 就會調用pvclock_gtod_notifier進而調用pvclock_gtod_notify
}

即當host更新時間,且kvm發現guest的clocksource從TSC變為非TSC時,發出KVM_REQ_MASTERCLOCK_UPDATE請求.

  4. 在kvm_arch_hardware_enable中,發現guest tsc發生了倒退,那么向所有vcpu發出KVM_REQ_MASTERCLOCK_UPDATE請求.

KVM_REQ_GLOBAL_CLOCK_UPDATE

  1. 在kvmclock驅動初始化時,kvmclock_init()中的kvm_register_clock觸發wrmsr進而調用kvm_set_msr_common寫MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW, 發出KVM_REQ_GLOBAL_CLOCK_UPDATE請求

  2. 在做從vcpu到pcpu(物理cpu)的遷移時,如果guest的tsc不一致,則需要發KVM_REQ_GLOBAL_CLOCK_UPDATE請求.

KVM_REQ_CLOCK_UPDATE

  1. kvm_gen_update_masterclock中,對所有vcpu發出KVM_REQ_CLOCK_UPDATE請求.而kvm_gen_update_masterclock為KVM_REQ_MASTERCLOCK_UPDATE請求的handler.

  2. 在kvmclock_update_fn函數中對所有vcpu發出KVM_REQ_CLOCK_UPDATE請求,kvmclock_update_fn的調用順序為:

    kvm_arch_init_vm()
    {
        // 初始化延時作業, 將kvmclock_update_fn注冊為kvm->arch.kvmclock_update_work的回調函數
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        // 初始化延時作業,將kvmclock_sync_fn注冊為kvm->arch.kvmclock_sync_work的回調函數
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
    }
    
    static void kvmclock_sync_fn(struct work_struct *work)
    {
        // 立即調用kvmclock_update_work->kvmclock_update_fn
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
    					KVMCLOCK_SYNC_PERIOD); // 300 s后重新調用kvmclock_sync_work->kvmclock_sync_fn
    }
    
    static void kvmclock_update_fn(struct work_struct *work)
    {
      	kvm_for_each_vcpu(i, vcpu, kvm) {
    		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); // kvm_guest_time_update()
    		kvm_vcpu_kick(vcpu); // kick a vcpu to sleep. or make a guest mode vcpu into host kernel mode.
    	}  
    }
    

    即在kvmclock的同步函數中定義了立即作業(更新kvmclock),和延時作業(同步kvmclock).也就是說,kvm第一次調用同步kvmclock函數后,每300s更新和同步一次kvmclock,每次更新kvmclock時都發出KVM_REQ_CLOCK_UPDATE請求.

  3. kvm_gen_kvmclock_update中,對當前vcpu發出KVM_REQ_CLOCK_UPDATE請求,100ms后調用更新kvmclock函數kvmclock_update_fn,后者對所有vcpu發出KVM_REQ_CLOCK_UPDATE請求.kvm_gen_kvmclock_update是KVM_REQ_GLOBAL_CLOCK_UPDATE請求的handler.

    static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
    {
    	struct kvm *kvm = v->kvm;
    
    	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); // 立即發送KVM_REQ_CLOCK_UPDATE請求
    	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
    					KVMCLOCK_UPDATE_DELAY); // 100ms后觸發kvmclock_update_fn
    }
    
  4. kvm_arch_vcpu_load中,如果檢測到了外部tsc_offset_adjustment,就發出KVM_REQ_CLOCK_UPDATE請求.即在切換到特定vcpu時,做檢測並決定是否發出KVM_REQ_CLOCK_UPDATE請求.

  5. kvm_set_guest_paused中,會發出KVM_REQ_CLOCK_UPDATE請求,kvm_set_guest_paused告訴guest kernel,該guest kernel已經被kvm停止了.即在guest kernel pause時,發出KVM_REQ_CLOCK_UPDATE請求.

  6. 在qemu發出KVM_SET_CLOCK的ioctl時,向所有vcpu發出KVM_REQ_CLOCK_UPDATE請求.qemu設置時鍾時,更新guest時鍾是理所應當的事情.

  7. 在__kvmclock_cpufreq_notifier中,對所有vcpu發出了KVM_REQ_CLOCK_UPDATE.因為該函數為cpu頻率變化時的回調函數,當host cpu頻率變化時,應該重新設置guest的時間.

  8. 在vmexit時,如果guest的tsc總是追上host的tsc,說明guest的tsc頻率高於host的tsc頻率,需要重新校准guest的時間.因此向當前vcpu發出KVM_REQ_CLOCK_UPDATE.

  9. kvm_arch_hardware_enable,如果host tsc不穩定,就對所有vcpu發出KVM_REQ_CLOCK_UPDATE請求.而kvm_arch_hardware_enable的調用路徑為:

    kvm_arch_hardware_enable => hardware_enable_nolock => kvm_starting_cpu

    ​ => kvm_resume

    也就是說,在kvm啟動vcpu和恢復vcpu的運行時,都需要發出KVM_REQ_CLOCK_UPDATE以調整時間.

三大請求的處理

在確定了各更新時間的請求的triger點之后,接下來看一下這些請求的handler究竟針對請求做了哪些處理.

3種請求均在vcpu_enter_guest(),即進入non-root之前做處理.

KVM_REQ_MASTERCLOCK_UPDATE

/*
 * Handler for KVM_REQ_MASTERCLOCK_UPDATE: re-evaluate whether this VM may
 * use the master clock, then force every vcpu to refresh its kvmclock.
 */
static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	int i;
	struct kvm_vcpu *vcpu;
	struct kvm_arch *ka = &kvm->arch;

	spin_lock(&ka->pvclock_gtod_sync_lock);
	kvm_make_mclock_inprogress_request(kvm); /* KVM_REQ_MCLOCK_INPROGRESS: keep every vcpu out of guest mode */
	/* no guest entries from this point */
	pvclock_update_vm_gtod_copy(kvm); /* decide whether the guest can use the master clock
					   * (used to keep time consistent across vcpus) */

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* every vcpu must refresh its clock */

	/* guest entries allowed */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); /* let vcpus enter guest mode again */

	spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}

/*
 * Snapshot the master clock (host boot-based ns + host TSC) and decide
 * whether this VM may use it: requires a TSC-based host clocksource,
 * matched vcpu TSCs, no observed TSC backwards jump, and a boot vcpu that
 * is not on the legacy kvmclock MSRs.
 */
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	int vclock_mode;
	bool host_tsc_clocksource, vcpus_matched;

	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			atomic_read(&kvm->online_vcpus)); /* do all online vcpus have matched TSCs? */

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now); /* true if the host clocksource is TSC-based; also
								 * snapshots host boot-based ns (master_kernel_ns)
								 * and the current host TSC (master_cycle_now) */

	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock; /* backwards_tsc_observed: a TSC rollback was seen
								      * boot_vcpu_runs_old_kvmclock: legacy MSRs in use */

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1); /* advertise globally that a guest uses the master clock */

	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
					vcpus_matched);
#endif
}

可以看到,對於KVM_REQ_MASTERCLOCK_UPDATE請求,kvm做了兩件事情,一件事情是確認guest能否使用master_clock(用於vcpu之間的時間同步),另一件事情是對所有vcpu發出了更基本的請求,即KVM_REQ_CLOCK_UPDATE請求(在KVM_REQ_CLOCK_UPDATE的處理中說明).

KVM_REQ_GLOBAL_CLOCK_UPDATE

/* Handler for KVM_REQ_GLOBAL_CLOCK_UPDATE: refresh the requesting vcpu's
 * clock immediately, then schedule an all-vcpu refresh shortly after. */
static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
	struct kvm *kvm = v->kvm;

	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); /* immediate KVM_REQ_CLOCK_UPDATE for this vcpu */
	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
					KVMCLOCK_UPDATE_DELAY); /* run kvmclock_update_fn (all vcpus) after 100 ms */
}

對於KVM_REQ_GLOBAL_CLOCK_UPDATE請求, kvm首先對當前vcpu發送了更基本請求,即KVM_REQ_CLOCK_UPDATE請求,在發出請求后100ms,調用kvmclock_update_fn,kvmclock_update_fn的作用是對所有vcpu發出KVM_REQ_CLOCK_UPDATE請求.

也就是說,KVM_REQ_GLOBAL_CLOCK_UPDATE的處理為:

  1. 向當前vcpu發送KVM_REQ_CLOCK_UPDATE請求
  2. 向所有vcpu發送KVM_REQ_CLOCK_UPDATE請求,並kick所有vcpu.

KVM_REQ_CLOCK_UPDATE

從上面的兩種請求的處理可以看到,上面兩種請求都以來基礎請求KVM_REQ_CLOCK_UPDATE,因此KVM_REQ_CLOCK_UPDATE的處理非常重要.

/*
 * Handler for KVM_REQ_CLOCK_UPDATE: recompute this vcpu's kvmclock data
 * (TSC timestamp, system time, scaling parameters) and, if paravirt time is
 * enabled, publish it to the guest's pvti page.
 */
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, tgt_tsc_khz;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;

	kernel_ns = 0;
	host_tsc = 0;

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	spin_lock(&ka->pvclock_gtod_sync_lock);
	use_master_clock = ka->use_master_clock;
	if (use_master_clock) { /* master clock available: take its snapshot */
		host_tsc = ka->master_cycle_now; /* host TSC recorded in the master clock */
		kernel_ns = ka->master_kernel_ns; /* host boot-based ns recorded in the master clock */
	}
	spin_unlock(&ka->pvclock_gtod_sync_lock);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); /* TSC frequency of the pcpu we run on */
	if (unlikely(tgt_tsc_khz == 0)) { /* frequency not known yet: retry via a fresh request */
		local_irq_restore(flags);
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}
	if (!use_master_clock) { /* no master clock: sample host TSC and boot time directly */
		host_tsc = rdtsc();
		kernel_ns = ktime_get_boottime_ns(); /* ns since host kernel boot */
	}

	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); /* host TSC scaled/offset into the guest TSC domain */

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 *   1) CPU could have been running below the maximum TSC rate
	 *   2) Broken TSC compensation resets the base at each VCPU
	 *      entry to avoid unknown leaps of TSC even when running
	 *      again on the same CPU.  This may cause apparent elapsed
	 *      time to disappear, and the guest to stand still or run
	 *	very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns); /* expected guest TSC for this boot-based time */
		if (tsc > tsc_timestamp) { /* guest TSC has fallen behind the expected value */
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp); /* grow the TSC offset by the gap */
			tsc_timestamp = tsc; /* and report the caught-up value */
		}
	}

	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	if (kvm_has_tsc_control) /* hardware TSC scaling is available */
		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);

	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { /* frequency changed: recompute pvti scaling fields */
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

	/* fill the pvti-shaped hv_clock */
	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	vcpu->last_guest_tsc = tsc_timestamp;

	/* If the host uses TSC clocksource, then it is stable */
	pvclock_flags = 0;
	if (use_master_clock)
		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;

	vcpu->hv_clock.flags = pvclock_flags;

	if (vcpu->pv_time_enabled) /* the guest registered a pvti via the system-time MSR */
		kvm_setup_pvclock_page(v); /* copy hv_clock into the guest's pvti page */
	if (v == kvm_get_vcpu(v->kvm, 0))
		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}

kvm_guest_time_update()做了以下幾件事情:

  1. 獲取host的tsc value和host kernel boot以來的ns數
  2. 讀取當前vcpu的tsc value
  3. 經過一系列的校准,將最終時間賦值給vcpu->hv_clock
  4. 如果vcpu使能了半虛擬化,就調用kvm_setup_pvclock_page

來看kvm_setup_pvclock_page.

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
    ...
        	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock)); // 將hv_clock的內容賦值到pv_time中去
    ...
}

這里的pv_time就是之前我們提到的每個vcpu都有1個的pvti結構.將將hv_clock的內容賦值到pv_time中去,即將最新時間更新到vcpu的pvti結構中去.

system time就這樣在pvti結構中被更新了.

host對system time的寫入

host對system time的寫入一般來說有2種情況,同步寫入和異步寫入.

同步寫入指的是周期性更新guest中system time的值,以和host時間保持一致.

異步寫入指的是在特殊事件發生時(如guest suspend時),更新guest中system time的值,防止guest中的時間出錯.

host對system time的同步寫入

kvm通過pvclock_gtod_register_notifier向timekeeper層注冊了一個回調pvclock_gtod_notify(在上面的三大請求trigger點的介紹中有提到),每當Host Kernel時鍾更新時(即timekeeping_update被調用時),就會調用pvclock_gtod_notify.

/* Timekeeper notifier: runs on every host clock update (timekeeping_update). */
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk); /* refresh KVM's copy of the timekeeper data */

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0) /* clocksource changed from TSC to non-TSC */
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}

static struct pvclock_gtod_data pvclock_gtod_data; /* KVM's private copy of the host timekeeper data */

/* Copy the relevant timekeeper fields into pvclock_gtod_data under its own
 * seqcount, so KVM can read host time without touching the timekeeper. */
static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); /* boot-based ns at this update */

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last; /* clocksource counter at this host time update */
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns; /* ns since host boot at this update */
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec; /* wallclock ns part (shifted) at this update */

	vdata->wall_time_sec            = tk->xtime_sec; /* wallclock seconds part at this update */

	write_seqcount_end(&vdata->seq);
}

pvclock_gtod_notify()完成了2件事情:

  1. 調用update_pvclock_gtod更新了pvclock_gtod_data
  2. 檢測host的clocksource是否變為了非tsc,如果變了則將作業pvclock_gtod_work入隊
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); /* queued when the host clocksource stops being TSC */

/* Ask every vcpu of every VM to re-evaluate the master clock, and clear the
 * global "some guest uses the master clock" flag. */
static void pvclock_gtod_update_fn(struct work_struct *work)
{
	struct kvm *kvm;

	struct kvm_vcpu *vcpu;
	int i;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	atomic_set(&kvm_guest_has_master_clock, 0);
	mutex_unlock(&kvm_lock);
}

可以看到pvclock_gtod_work的實際函數pvclock_gtod_update_fn的作用為:

向所有vcpu發出KVM_REQ_MASTERCLOCK_UPDATE,而后者經過層層調用,更新每個vcpu的pvti結構中的時間數據.

也就是說,每當Host Kernel時鍾更新時,如果使用master_clock,kvm會更新每個vcpu的pvti時間.內核的代碼中使用tk_clock_read讀取clocksource當前counter,但是沒有發現上下文中有對讀取時間的cpu的限制.

host對system time的異步寫入

host對system time的異步寫入通過qemu實現,利用kvm_vm_ioctl(KVM_SET_CLOCK),與kvm發生交互.

而kvm中,KVM_SET_CLOCK的ioctl的定義如下:

case KVM_SET_CLOCK: { /* userspace (e.g. qemu) pushes a new kvmclock value */
		struct kvm_clock_data user_ns;
		u64 now_ns;

		r = -EFAULT;
		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
			goto out;

		r = -EINVAL;
		if (user_ns.flags)
			goto out;

		r = 0;
		/*
		 * TODO: userspace has to take care of races with VCPU_RUN, so
		 * kvm_gen_update_masterclock() can be cut down to locked
		 * pvclock_update_vm_gtod_copy().
		 */
		kvm_gen_update_masterclock(kvm); // decide whether the guest can use the master clock and request clock updates on all vcpus
		now_ns = get_kvmclock_ns(kvm); // read the current kvmclock time
		kvm->arch.kvmclock_offset += user_ns.clock - now_ns; // accumulate the offset between the time userspace passed in and the current time
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); // have every vcpu refresh its time with the new offset
		break;
	}

可以看到,kvm_vm_ioctl(KVM_SET_CLOCK)做了以下幾件事情:

  1. 確認guest能否使用masterclock,並向所有vcpu發出時間更新請求
  2. 讀取當前cpu的時間(根據是否使用masterclock,讀取時間的方式不同)
  3. 計算當前cpu和qemu傳入的cpu時間的offset
  4. 利用新的offset對所有vcpu的時間進行更新

host對system time的異步寫入依賴qemu和kvm的交互kvm_vm_ioctl(KVM_SET_CLOCK).

masterclock: 由於我們的kvmclock依賴於Host Boot Time和Host TSC兩個量,即使Host TSC同步且Guest TSC同步,在pCPU0和pCPU1分別取兩者,前者的差值和后者的差值也可能不相等,並且誰大誰小都有可能,從而可能違反kvmclock的單調性。因此,我們通過只使用一份Master Copy,即Master Clock,來解決這個問題。

---update on 6.1 2020-----

由於對pvclock_gtod_data和各vcpu的pvti結構的更新之間的關系不太清楚,特此研究記錄.

與各vcpu的pvti結構對應的host虛擬地址的申請

在kvmclock驅動初始化時,kvmclock_init()中的kvm_register_clock觸發wrmsr進而調用kvm_set_msr_common寫MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW.

在kvm_set_msr_common()中最關鍵的一句話為:

/* In kvm_set_msr_common(): data is the guest-physical address of the vcpu's
 * pvti structure written to the MSR; bit 0 is the enable bit and is masked
 * off with ~1ULL before translation. */
if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info))) // compute the hva corresponding to gpa == data,
			 // storing hva, gpa, region length, the matching memslot and its generation into arch.pv_time
			vcpu->arch.pv_time_enabled = false;
		else
			vcpu->arch.pv_time_enabled = true;

kvm_gfn_to_hva_cache_init的函數原型為:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len);

其中,ghc為gfn_to_hva_cache結構體類型,意義為將guest frame number轉化成host virtual address的cache.定義為:

/* Cached translation of a guest frame number to a host virtual address. */
struct gfn_to_hva_cache {
	u64 generation; // memslots generation this cache was built against
	gpa_t gpa; // guest physical address
	unsigned long hva; // host virtual address
	unsigned long len; // size of the cached region
	struct kvm_memory_slot *memslot; // kvm memslot backing this cache
};

kvm_gfn_to_hva_cache_init()函數的實現為:

/* Thin wrapper: look up the VM's current memslots and initialize the cache. */
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm); // the VM's full set of memslots
	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
}

/* Resolve [gpa, gpa+len) to a host virtual address and record the result
 * (gpa, hva, len, memslot, generation) in ghc.  Returns 0 on success,
 * -EINVAL for an invalid range, -EFAULT if no valid hva exists. */
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa); // offset of gpa within its page
    /* gpa is a single address, but the region spans len bytes, so there is
     * a start gfn and an end gfn */
	gfn_t start_gfn = gpa >> PAGE_SHIFT; // first guest frame number of the region
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; // last guest frame number of the region
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1; // number of pages the region covers
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL; // validity check on the gfn range

    /* record gpa, the memslots generation and the region length in the cache */
	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->len = len;
    
	ghc->hva = KVM_HVA_ERR_BAD; // mark the hva invalid until resolved below
    
    
    //----------------------------- resolve the hva for gpa and store it in the cache

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) { // walk all pages so a region crossing memslots is fully validated
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset; 
	else
		ghc->memslot = NULL; // NULL memslot forces the uncached slow path later
	return r;
}

從上面的分析可以看出,以下這段語句的作用為:從kvm_memslots中申請大小為pvclock_vcpu_time_info結構大小的緩存空間,該緩存空間緩存的是物理地址為data中所存地址中指向的內容,該緩存空間對應的host虛擬地址為hva.

kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info))

在kvmclock驅動初始化寫MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW時導致的kvm_set_msr_common()中,data就是每個vcpu都有的pvti結構.讓pvti有了一個host虛擬地址.

向各vcpu的pvti結構對應的host虛擬地址寫入時間

先不論時間從哪里來,肯定會向pvti結構寫入時間.

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
    ...
    if (vcpu->pv_time_enabled) // set in kvm_set_msr_common once the pvti gpa->hva cache was established
		kvm_setup_pvclock_page(v);
}

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
    ...
    	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock)); // copy hv_clock (pvti-typed) to the hva cached in the vcpu's gfn_to_hva_cache, i.e. into the vcpu's pvti
}

/* Write len bytes of data to guest memory through a gfn_to_hva_cache. */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
}

/* Write len bytes of data at offset into the guest region described by ghc.
 * Re-initializes the cache if the memslots generation changed, and falls
 * back to the uncached kvm_write_guest path for cross-page regions. */
int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				  void *data, unsigned int offset,
				  unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;
	gpa_t gpa = ghc->gpa + offset;

	BUG_ON(len + offset > ghc->len);

	if (slots->generation != ghc->generation)
		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); // memslots changed: rebuild the cached translation

	if (unlikely(!ghc->memslot))
		return kvm_write_guest(kvm, gpa, data, len); // cross-page region: use the slow path

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva + offset, data, len); // write the data to the cached hva
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);

	return 0;
}

綜上,下面這段語句的意義為:

將vcpu的hv_clock(pvti結構類型)數據寫入屬於該vcpu的pvti對應的host虛擬地址中去.

kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock)); 

pv_clock_gtod_data

pvclock_gtod_data是一個全局變量, kvm會在每次host更新時鍾(即timekeeping_update被調用)時更新該變量的內容.

/* Global snapshot of the host timekeeper state that kvm maintains for
 * master-clock computations; protected by seq for lockless readers. */
struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int vclock_mode;
		u64	cycle_last;
		u64	mask;
		u32	mult;
		u32	shift;
	} clock;

	u64		boot_ns;      /* ns since host boot at the last update */
	u64		nsec_base;    /* ns part of the wall clock at the last update */
	u64		wall_time_sec; /* seconds part of the wall clock at the last update */
};

如何更新的呢?

kvm通過pvclock_gtod_register_notifier向timekeeper層注冊了一個回調pvclock_gtod_notify,每當Host Kernel時鍾更新時(即timekeeping_update被調用時),就會調用pvclock_gtod_notify,進而調用update_pvclock_gtod更新pvclock_gtod_data的值.

/* Notifier called on every host timekeeping_update(); same excerpt as quoted
 * earlier, repeated here for the pvclock_gtod_data discussion. */
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk); // refresh the contents of pvclock_gtod_data

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0) // the clocksource changed from TSC to non-TSC
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}

static struct pvclock_gtod_data pvclock_gtod_data;

/* Copy the fields kvm needs out of the host timekeeper into the global
 * pvclock_gtod_data, under its seqcount for consistent lockless reads.
 * (The quoted line "vdata = boot_ns;" was a transcription error in the
 * original excerpt — it assigned a u64 to the struct pointer; the actual
 * kernel 5.3 statement is "vdata->boot_ns = boot_ns;", matching the first
 * copy of this function quoted above and the gtod->boot_ns read in
 * do_monotonic_boot().) */
static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode; // clocksource type
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last; // clocksource counter value at this host clock update
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns; // base time (ns since host boot) used when deriving the current time
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec; // ns part of the wall clock at this update

	vdata->wall_time_sec            = tk->xtime_sec; // seconds part of the wall clock at this update

	write_seqcount_end(&vdata->seq);
}

pvti結構的數據來源vcpu->hv_clock

目前已知:

  1. kvm在kvm_guest_time_update()中更新各vcpu的pvti結構時,是將當前時間賦值給該函數中的hv_clock,然后將hv_clock的內容寫入到pvti結構中去.
  2. kvm會在host的每個tick,即每次host更新時間時,將時間相關變量更新到全局變量pvclock_gtod_data中

由此推斷,hv_clock肯定跟pvclock_gtod_data有一定的關系.下面尋找他們之間的聯系.

首先,在kvm_guest_time_update()中會檢查是否使用master_clock即use_master_clock的值,根據該bool值的取值,當前時間的獲取方式也不同.

kvm中何時決定use_master_clock的值?暫時不做討論,這里只需要知道use_master_clock為1時,kvm只使用一份host tsc和guest tsc,其它vcpu復制之.

use_master_clock為True

如果use_master_clock為真,則讓:

host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;

那么 ka->master_cycle_now和ka->master_kernel_ns的意義是什么,何時被賦值的?

在pvclock_update_vm_gtod_copy()中,有對master_cycle_now和master_kernel_ns的賦值,以kvm_get_time_and_clockread()形式呈現,kvm_get_time_and_clockread通過do_monotonic_boot()和pvclock_gtod_data中的值來獲得master_kernel_ns的值,意義為自host boot 以來的ns數.

而do_monotonic_boot通過vgettsc=>read_tsc,獲得master_cycle_now的值,read_tsc通過對比rdtsc指令和pv_clock_gtod_data->clock.cycle_last的返回值,確定tsc value是否后退,如果后退,則返回pv_clock_gtod_data->clock.cycle_last的值,即上一次讀取tsc時的值,如果沒有后退,則返回rdtsc指令的結果.總之,master_cycle_now代表當前PCPU上的沒有后退的tsc值.

// where kvm refreshes master_kernel_ns and master_cycle_now
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
    ...
        host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);
}

static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
    ...
        return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
						      tsc_timestamp)); // do_monotonic_boot fills in master_kernel_ns and master_cycle_now
}

/* Compute ns since host boot from the pvclock_gtod_data snapshot plus the
 * TSC delta since the last host clock update; also returns the raw TSC
 * reading via tsc_timestamp.  The seqcount loop retries if the snapshot
 * was concurrently updated. */
static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->nsec_base;
		ns += vgettsc(tsc_timestamp, &mode); // add cycles elapsed since cycle_last (mult applied inside)
		ns >>= gtod->clock.shift;
		ns += gtod->boot_ns;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
	*t = ns;

	return mode;
}

// hosts in practice almost always use TSC, so the VCLOCK_TSC case is taken
static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
{
    ...
        case VCLOCK_TSC:
		*mode = VCLOCK_TSC;
		*tsc_timestamp = read_tsc(); // non-retreating TSC read (see read_tsc below)
		v = (*tsc_timestamp - gtod->clock.cycle_last) &
			gtod->clock.mask; // cycles elapsed since the last host clock update
		break;
}

/* Read the TSC, clamped so it never appears to go backwards relative to
 * the counter value recorded at the last host clock update. */
static u64 read_tsc(void)
{
	u64 ret = (u64)rdtsc_ordered();
	u64 last = pvclock_gtod_data.clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/*
	 * GCC likes to generate cmov here, but this branch is extremely
	 * predictable (it's just a function of time and the likely is
	 * very likely) and there's a data dependence, so force GCC
	 * to generate a branch instead.  I don't barrier() because
	 * we don't actually need a barrier, and if this function
	 * ever gets inlined it will generate worse code.
	 */
	asm volatile ("");
	return last; // TSC appeared to retreat: return the previous reading instead
}

pvclock_update_vm_gtod_copy()只在2個kvm代碼的2個地方有引用,1是kvm_arch_init_vm(),2是kvm_gen_update_masterclock().后者在2個地方有引用,1是kvm_arch_vm_ioctl(KVM_SET_CLOCK),2是在每次vcpu_enter_guest()時檢查到KVM_REQ_MASTERCLOCK_UPDATE請求時.

也就是說,ka->master_kernel_ns和ka->master_cycle_now會在kvm運行的3個地方更新:

  1. 初始化虛擬機(guest)時
  2. 每次進入non-root mode檢測到KVM_REQ_MASTERCLOCK_UPDATE請求時
  3. 在userspace(如qemu)主動發起更新時間的請求時

結論: 如果使用master_clock, host_tsc表示當前PCPU上的無回退的TSC值, kernel_ns表示自host啟動以來的ns數.也只有use_master_clock為真時,kvm維護的pvclock_gtod_data的內容才會起作用.

use_master_clock為False

如果use_master_clock為假,則讓:

host_tsc = rdtsc();
kernel_ns = ktime_get_boottime_ns();

rdtsc()會直接讀取當前PCPU的TSC值

ktime_get_boottime_ns()獲取自host boot以來的ns數(利用的是host kernel中的timekeeping結構).

結論: 如果不使用master_clock, host_tsc表示當前PCPU(不保證是否回退)的TSC值,kernel_ns表示自host啟動以來的ns數.

TSC校准系數的調整及pvti cache的最終賦值

在獲得host_tsc和kernel_ns后,利用kvm_read_l1_tsc獲得arch層面的tsc_offset和tsc_scale, 並利用這二者對host_tsc進行調整,賦值給tsc_timestamp,那么tsc_timestamp的意義就非常明顯了,即"本次計時的TSC時間戳".

tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);

之后將根據vcpu->tsc_catchup的取值,決定是否對arch層面的tsc_offset和tsc_scale進行調整.如果需要調整,根據上面的host_tsc計算此時tsc的理論值是多少,如果理論值比讀到的tsc值大,說明guest的tsc_offset和tsc_scale已經無法正確調整host_tsc的值了,需要修正.進而利用理論tsc和當前tsc進行的差值修正guest的tsc_offset和tsc_scale,並將理論tsc值賦值給本次計時的TSC時間戳.

if (vcpu->tsc_catchup) {
    u64 tsc = compute_guest_tsc(v, kernel_ns);
    if (tsc > tsc_timestamp) {
        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
        tsc_timestamp = tsc;
    }
}

接下來做了:

  1. 如果支持TSC scaling feature, 就利用該feature調整本地vcpu的目標TSC頻率.
if (kvm_has_tsc_control)
		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
  1. 如果本地vcpu的tsc頻率與目標tsc頻率不同,則重新調整hv_clock的shift和multi系數,以確保本地vcpu的tsc頻率與目標tsc頻率相等.
	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

之后將本次計時的TSC時間戳,正確的system time(在實際使用時要加上wallclock時間才是標准時間)賦值給hv_clock結構,並將本次計時的時間戳保存在該vcpu的last_guest_tsc變量中.

vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;

然后在kvm_setup_pvclock_page()中將hv_clock中的值更新到pv_time cache中去,該cache在kvmclock_init => WRITE MSR => handle_wrmsr => kvm_set_msr_common => kvm_gfn_to_hva_cache_init 流程中被分配空間及host虛擬地址,每個vcpu都有一個pv_time cache,其中的gpa指向每個vcpu的pvti結構.

if (vcpu->pv_time_enabled)
		kvm_setup_pvclock_page(v); // 將hv_clock中的值賦值到pv_time cache中

以上, 對pvclock_gtod_data和各vcpu的pvti結構的更新之間的關系梳理基本完成.

基本思路遵循:

  1. 建立pvti結構的cache
  2. 在每次kvm更新時間時更新cache中的內容


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM