In OpenStack, virtual machine migration roughly falls into three types: cold migration, live migration, and evacuation (failure migration).
1. Cold migration
How it works: a new virtual machine is re-created on the destination node using the same resources the original one required.
Cold-migration flow diagram and a more detailed sequence diagram (figures not reproduced here).
When a cold migration of an instance is requested, the first function reached is _migrate in nova/api/openstack/compute/migrate_server.py:
@wsgi.response(202)
@extensions.expected_errors((400, 403, 404, 409))
@wsgi.action('migrate')
def _migrate(self, req, id, body):
    """Permit admins to migrate a server to a new host."""
    context = req.environ['nova.context']
    context.can(ms_policies.POLICY_ROOT % 'migrate')
    host = body["migrate"]["host"]
    instance = common.get_instance(self.compute_api, context, id)
    try:
        self.compute_api.resize(req.environ['nova.context'], instance,
                                host=host)
    ........
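For context, here is a hypothetical client-side sketch (not Nova code; the endpoint, token, server id, host and flavor id are all placeholders) of the requests that land in this handler. Both the cold-migrate and the resize actions go to the same /servers/{id}/action endpoint, which is why, as explained below, they share one code path; the "host" key mirrors the body["migrate"]["host"] read in the handler above.

import requests

NOVA = "http://controller:8774/v2.1"            # assumed endpoint
HEADERS = {"X-Auth-Token": "<keystone-token>",  # assumed token
           "Content-Type": "application/json"}
SERVER = "<server-uuid>"

# Cold migration: same workflow as resize, but the flavor stays the same.
requests.post("%s/servers/%s/action" % (NOVA, SERVER),
              json={"migrate": {"host": "<target-host>"}},
              headers=HEADERS)

# Resize: same workflow, but with a new (larger) flavor.
requests.post("%s/servers/%s/action" % (NOVA, SERVER),
              json={"resize": {"flavorRef": "<new-flavor-id>"}},
              headers=HEADERS)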
The core of the _migrate handler is the call to resize(). OpenStack also has a resize feature of its own, which upgrades an instance's flavor (it can only scale up); cold migration follows exactly the same workflow as resize, except that the flavor does not change. The implementation is in nova/compute/api.py:
@check_instance_lock
@check_instance_cell
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED])
def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
           host=None, **extra_instance_updates):
    # Check whether a new flavor was requested. For a resize, reserve quota,
    # update the instance state, record the migration, and build the
    # RequestSpec object describing what the destination host must satisfy
    ........
    self.compute_task_api.resize_instance(
        context, instance, extra_instance_updates,
        scheduler_hint=scheduler_hint,
        flavor=new_instance_type,
        reservations=quotas.reservations or [],
        clean_shutdown=clean_shutdown,
        request_spec=request_spec,
        host=host)
The key step in resize() is the call to resize_instance(), implemented in nova/conductor/api.py, which in turn calls migrate_server() in nova/conductor/rpcapi.py:
def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
                   flavor, block_migration, disk_over_commit,
                   reservations=None, clean_shutdown=True, request_spec=None,
                   host=None):
    # Build the kw arguments according to the negotiated RPC version
    return cctxt.call(context, 'migrate_server', **kw)
This RPC call lands in the conductor process, entering migrate_server() in nova/conductor/manager.py:
def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
                   flavor, block_migration, disk_over_commit,
                   reservations=None, clean_shutdown=True, request_spec=None,
                   host=None):
    # A few checks decide whether this is a cold or a live migration
    if not live and not rebuild and flavor:
        # Cold migration takes this branch
        instance_uuid = instance.uuid
        with compute_utils.EventReporter(context, 'cold_migrate',
                                         instance_uuid):
            self._cold_migrate(context, instance, flavor,
                               scheduler_hint['filter_properties'],
                               reservations, clean_shutdown, request_spec,
                               host=host)
    else:
        raise NotImplementedError()
The key call here is _cold_migrate():
@wrap_instance_event(prefix='conductor')
def _cold_migrate(self, context, instance, flavor, filter_properties,
                  reservations, clean_shutdown, request_spec, host=None):
    image = utils.get_image_from_system_metadata(
        instance.system_metadata)
    task = self._build_cold_migrate_task(context, instance, flavor,
                                         request_spec, reservations,
                                         clean_shutdown, host=host)
    task.execute()
It builds a migration task and then executes it:
def _build_cold_migrate_task(self, context, instance, flavor, request_spec,
                             reservations, clean_shutdown, host=None):
    # nova/conductor/tasks/migrate.py
    return migrate.MigrationTask(context, instance, flavor,
                                 request_spec, reservations,
                                 clean_shutdown, self.compute_rpcapi,
                                 self.scheduler_client, host=host)
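The MigrationTask returned above follows nova's conductor task pattern: a base class whose execute() drives the subclass's _execute() and rolls back on failure. A simplified sketch of that pattern (modeled on nova/conductor/tasks/base.py, with details omitted):

class TaskBase(object):
    """Template-method base: execute() drives the subclass hooks."""

    def execute(self):
        try:
            return self._execute()
        except Exception:
            # Give the task a chance to undo partial work, then re-raise
            self.rollback()
            raise

    def _execute(self):
        raise NotImplementedError()

    def rollback(self):
        pass


class MigrationTask(TaskBase):
    def _execute(self):
        # pick a destination host, then RPC prep_resize to it (see below)
        pass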
The returned MigrationTask inherits from TaskBase, whose execute() simply calls _execute(), so we can go straight to MigrationTask._execute():
def _execute(self):
    # Pick a destination host, then ask it to prepare for the resize
    self.compute_rpcapi.prep_resize(
        self.context, self.instance, legacy_spec['image'],
        self.flavor, host, self.reservations,
        request_spec=legacy_spec, filter_properties=legacy_props,
        node=node, clean_shutdown=self.clean_shutdown)
This calls prep_resize() in nova/compute/rpcapi.py:
def prep_resize(self, ctxt, instance, image, instance_type, host,
                reservations=None, request_spec=None,
                filter_properties=None, node=None,
                clean_shutdown=True):
    image_p = jsonutils.to_primitive(image)
    msg_args = {'instance': instance,
                'instance_type': instance_type,
                'image': image_p,
                'reservations': reservations,
                'request_spec': request_spec,
                'filter_properties': filter_properties,
                'node': node,
                'clean_shutdown': clean_shutdown}
    version = '4.1'
    client = self.router.by_host(ctxt, host)
    if not client.can_send_version(version):
        version = '4.0'
        msg_args['instance_type'] = objects_base.obj_to_primitive(
            instance_type)
    cctxt = client.prepare(server=host, version=version)
    # RPC-cast to the destination host so it can prepare resources for the
    # instance about to be migrated there
    cctxt.cast(ctxt, 'prep_resize', **msg_args)
This RPC lands in prep_resize() in nova/compute/manager.py on the destination host, whose core is the call to _prep_resize():
def _prep_resize(self, context, image, instance, instance_type,
                 quotas, request_spec, filter_properties, node,
                 clean_shutdown=True):
    .........
    rt = self._get_resource_tracker()
    # Check and claim the required resources, then persist the host's
    # updated resource usage to the database
    with rt.resize_claim(context, instance, instance_type, node,
                         image_meta=image, limits=limits) as claim:
        LOG.info(_LI('Migrating'), instance=instance)
        self.compute_rpcapi.resize_instance(
            context, instance, claim.migration, image,
            instance_type, quotas.reservations,
            clean_shutdown)
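resize_claim() above is used as a context manager: resources are claimed when the block is entered and released again if anything inside it fails. A self-contained, illustrative sketch of that idea (these classes are made up for the example and are not Nova's):

class HostResources(object):
    def __init__(self, free_mb):
        self.free_mb = free_mb

    def reserve(self, mb):
        if mb > self.free_mb:
            raise RuntimeError("not enough memory on this host")
        self.free_mb -= mb

    def release(self, mb):
        self.free_mb += mb


class ResizeClaim(object):
    """Claim memory on enter, give it back automatically on failure."""

    def __init__(self, host, mb):
        self.host = host
        self.mb = mb

    def __enter__(self):
        self.host.reserve(self.mb)
        return self

    def __exit__(self, exc_type, exc, tb):
        if exc_type is not None:
            self.host.release(self.mb)   # abort the claim if the body failed
        return False                     # never swallow the exception


host = HostResources(free_mb=4096)
with ResizeClaim(host, mb=2048):
    pass                                 # the RPC to resize_instance happens here
print(host.free_mb)                      # 2048: the claim is kept on success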
Once the resources are claimed, self.compute_rpcapi.resize_instance() is called:
def resize_instance(self, ctxt, instance, migration, image, instance_type,
                    reservations=None, clean_shutdown=True):
    msg_args = {'instance': instance, 'migration': migration,
                'image': image, 'reservations': reservations,
                'instance_type': instance_type,
                'clean_shutdown': clean_shutdown,
                }
    version = '4.1'
    client = self.router.by_instance(ctxt, instance)
    if not client.can_send_version(version):
        msg_args['instance_type'] = objects_base.obj_to_primitive(
            instance_type)
        version = '4.0'
    cctxt = client.prepare(server=_compute_host(None, instance),
                           version=version)
    # RPC to the source host, where resize_instance() in
    # nova/compute/manager.py is executed
    cctxt.cast(ctxt, 'resize_instance', **msg_args)
On the source host, resize_instance() does the following:
def resize_instance(self, context, instance, image,
                    reservations, migration, instance_type,
                    clean_shutdown):
    # Get the network info of the instance to be migrated
    # Update the instance state in the database
    # Send the migration notifications
    # Power off the instance and migrate its disks
    disk_info = self.driver.migrate_disk_and_power_off(
        context, instance, migration.dest_host,
        instance_type, network_info,
        block_device_info, timeout, retry_interval)
    # Start migrating the instance's network
    self.network_api.migrate_instance_start(context, instance, migration_p)
    self.compute_rpcapi.finish_resize(context, instance,
                                      migration, image, disk_info,
                                      migration.dest_compute,
                                      reservations=quotas.reservations)
It then RPCs to finish_resize() on the destination host:
def finish_resize(self, context, disk_info, image, instance,
                  reservations, migration):
    # Commit the quota reservations
    .....
    self._finish_resize(context, instance, migration,
                        disk_info, image_meta)

def _finish_resize(self, context, instance, migration, disk_info,
                   image_meta):
    # Set up networking on the destination host
    self.network_api.setup_networks_on_host(context, instance,
                                            migration['dest_compute'])
    migration_p = obj_base.obj_to_primitive(migration)
    self.network_api.migrate_instance_finish(context,
                                             instance,
                                             migration_p)
    # Get the instance's current network info
    network_info = self.network_api.get_instance_nw_info(context, instance)
    # Update the instance state in the database
    instance.task_state = task_states.RESIZE_FINISH
    instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
    # nova/virt/libvirt/driver.py
    self.driver.finish_migration(context, migration, instance,
                                 disk_info,
                                 network_info,
                                 image_meta, resize_instance,
                                 block_device_info, power_on)
Finally, a confirm-resize action has to be triggered to complete the cold migration; it confirms the migration and removes the instance's data and network resources on the source host. The handler lives in nova/api/openstack/compute/servers.py:
@wsgi.action('confirmResize')
def _action_confirm_resize(self, req, id, body):
    self.compute_api.confirm_resize(context, instance)

def confirm_resize(self, context, instance, migration=None):
    """Confirms a migration/resize and deletes the 'old' instance."""
    # Update the migration status and the quota
    ......
    self.compute_rpcapi.confirm_resize(context,
                                       instance,
                                       migration,
                                       migration.source_compute,
                                       quotas.reservations or [])
confirm_resize() is then RPC-ed to the source host, where the core of the work is the call to _confirm_resize():
def _confirm_resize(self, context, instance, quotas, migration=None):
    """Destroys the source instance."""
    self._notify_about_instance_usage(context, instance,
                                      "resize.confirm.start")
    # NOTE(tr3buchet): tear down networks on source host
    self.network_api.setup_networks_on_host(context, instance,
                                            migration.source_compute,
                                            teardown=True)
    network_info = self.network_api.get_instance_nw_info(context,
                                                         instance)
    # TODO(mriedem): Get BDMs here and pass them to the driver.
    # Delete the instance on the source host
    self.driver.confirm_migration(context, migration, instance,
                                  network_info)
    # Update the migration status
    migration.status = 'confirmed'
    with migration.obj_as_admin():
        migration.save()
    # Update the resource usage
    rt = self._get_resource_tracker()
    rt.drop_move_claim(context, instance, migration.source_node,
                       old_instance_type, prefix='old_')
    instance.drop_migration_context()
2. Live migration
How it works: live migration follows a workflow similar to cold migration, but because the instance keeps running during the move, many more compatibility checks are performed, for example CPU compatibility between the two hosts. Live migration is not entirely free of downtime: at the very end the instance is briefly paused so that the last round of memory copying can complete quickly.
Two factors are critical to live migration:
(1) the rate at which the guest dirties memory pages, since the iterative copy works page by page;
(2) the network bandwidth: if pages are dirtied much faster than they can be copied, the migration will not converge within a reasonable time.
libvirtd's data-migration logic:
(1) mark all guest memory as dirty;
(2) transfer all dirty memory, then recompute the memory newly dirtied in the meantime, and iterate until an exit condition is met;
(3) pause the guest and transfer the remaining data.
The exit condition in step (2) can be, for example (a toy simulation of the whole loop follows this list):
(1) 50% or less of the memory still needs to be migrated;
(2) the number of iterations has reached a configured limit.
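Here is that toy simulation (plain Python, not Nova or libvirt code) of the iterative pre-copy loop and the exit conditions listed above; all numbers are made up:

def precopy_rounds(total_pages, dirty_rate, bandwidth,
                   remain_ratio=0.5, max_iters=30):
    """Return (iterations, pages left for the final paused copy)."""
    remaining = total_pages
    iterations = 0
    while remaining > total_pages * remain_ratio and iterations < max_iters:
        copy_time = remaining / float(bandwidth)   # seconds for this round
        remaining = dirty_rate * copy_time         # pages dirtied meanwhile
        iterations += 1
    return iterations, remaining


# Converges quickly when bandwidth far exceeds the dirty-page rate ...
print(precopy_rounds(total_pages=1000000, dirty_rate=20000, bandwidth=200000))
# ... but never converges (it just hits max_iters) when pages are dirtied
# faster than they can be copied, which is exactly the bandwidth concern above.
print(precopy_rounds(total_pages=1000000, dirty_rate=250000, bandwidth=200000))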
Live-migration flow diagram (figure not reproduced here).
When a live migration is requested, the entry point in nova is the _migrate_live function in nova-api's migrate_server.py:
@wsgi.action('os-migrateLive')
@validation.schema(migrate_server.migrate_live, "2.0", "2.24")
@validation.schema(migrate_server.migrate_live_v2_25, "2.25", "2.29")
@validation.schema(migrate_server.migrate_live_v2_30, "2.30")
def _migrate_live(self, req, id, body):
    self.compute_api.live_migrate(context, instance, block_migration,
                                  disk_over_commit, host, force, async)
This calls into nova/compute/api.py:
def live_migrate(self, context, instance, block_migration,
                 disk_over_commit, host_name, force=None, async=False):
    # Update the instance state
    # Build the request_spec
    ......
    # nova/conductor/api.py
    self.compute_task_api.live_migrate_instance(context, instance,
        host_name, block_migration=block_migration,
        disk_over_commit=disk_over_commit,
        request_spec=request_spec, async=async)
live_migrate_instance() in turn calls live_migrate_instance() in the rpcapi.py of the same package:
def live_migrate_instance(self, context, instance, scheduler_hint,
                          block_migration, disk_over_commit, request_spec):
    kw = {'instance': instance, 'scheduler_hint': scheduler_hint,
          'block_migration': block_migration,
          'disk_over_commit': disk_over_commit,
          'request_spec': request_spec,
          }
    version = '1.15'
    cctxt = self.client.prepare(version=version)
    # Hand the request over to the conductor process
    cctxt.cast(context, 'live_migrate_instance', **kw)

@wrap_instance_event(prefix='conductor')
def live_migrate_instance(self, context, instance, scheduler_hint,
                          block_migration, disk_over_commit, request_spec):
    self._live_migrate(context, instance, scheduler_hint,
                       block_migration, disk_over_commit, request_spec)

def _live_migrate(self, context, instance, scheduler_hint,
                  block_migration, disk_over_commit, request_spec):
    destination = scheduler_hint.get("host")
    # Initialize a Migration object to track this live migration
    migration = objects.Migration(context=context.elevated())
    migration.dest_compute = destination
    migration.status = 'accepted'
    migration.instance_uuid = instance.uuid
    migration.source_compute = instance.host
    migration.migration_type = 'live-migration'
    if instance.obj_attr_is_set('flavor'):
        migration.old_instance_type_id = instance.flavor.id
        migration.new_instance_type_id = instance.flavor.id
    else:
        migration.old_instance_type_id = instance.instance_type_id
        migration.new_instance_type_id = instance.instance_type_id
    migration.create()

    # Build the live-migration task
    task = self._build_live_migrate_task(context, instance, destination,
                                         block_migration, disk_over_commit,
                                         migration, request_spec)
    task.execute()

def _build_live_migrate_task(self, context, instance, destination,
                             block_migration, disk_over_commit, migration,
                             request_spec=None):
    # nova/conductor/tasks/live_migrate.py
    return live_migrate.LiveMigrationTask(context, instance,
                                          destination, block_migration,
                                          disk_over_commit, migration,
                                          self.compute_rpcapi,
                                          self.servicegroup_api,
                                          self.scheduler_client,
                                          request_spec)
We can look directly at the _execute() implementation of the LiveMigrationTask class in nova/conductor/tasks/live_migrate.py:
def _execute(self):
    # Check that the instance is in the active state
    self._check_instance_is_active()
    # Check that the source host is up
    self._check_host_is_up(self.source)

    # If no destination host was specified, ask the scheduler to pick a
    # suitable one
    if not self.destination:
        self.destination = self._find_destination()
        self.migration.dest_compute = self.destination
        self.migration.save()
    else:
        # If a destination was specified, check that it can accept the
        # live migration
        self._check_requested_destination()

    # TODO(johngarbutt) need to move complexity out of compute manager
    # TODO(johngarbutt) disk_over_commit?
    return self.compute_rpcapi.live_migration(self.context,
                                              host=self.source,
                                              instance=self.instance,
                                              dest=self.destination,
                                              block_migration=self.block_migration,
                                              migration=self.migration,
                                              migrate_data=self.migrate_data)

def _check_requested_destination(self):
    # Make sure the source and destination are not the same physical host
    self._check_destination_is_not_source()
    # Check that the destination host is up
    self._check_host_is_up(self.destination)
    # Check that the destination host has enough memory
    self._check_destination_has_enough_memory()
    # Check that the source and destination hypervisors are compatible
    self._check_compatible_with_source_hypervisor(self.destination)
    # Check that a live migration to the destination host can actually be
    # performed
    self._call_livem_checks_on_host(self.destination)
_call_livem_checks_on_host() RPCs to the destination host to run check_can_live_migrate_destination(), which verifies that the destination can accept the live migration; the destination in turn RPCs back to the source to run check_can_live_migrate_source(), which verifies that the source supports it.
def _do_check_can_live_migrate_destination(self, ctxt, instance,
                                           block_migration,
                                           disk_over_commit):
    src_compute_info = obj_base.obj_to_primitive(
        self._get_compute_info(ctxt, instance.host))
    dst_compute_info = obj_base.obj_to_primitive(
        self._get_compute_info(ctxt, CONF.host))
    # nova/virt/libvirt/driver.py
    dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
        instance, src_compute_info, dst_compute_info,
        block_migration, disk_over_commit)
    LOG.debug('destination check data is %s', dest_check_data)
    try:
        # RPC back to the source host to check whether it supports the
        # live migration
        migrate_data = self.compute_rpcapi.\
            check_can_live_migrate_source(ctxt, instance,
                                          dest_check_data)
    finally:
        # Clean up the temporary files created during the checks
        self.driver.cleanup_live_migration_destination_check(ctxt,
                                                             dest_check_data)
    return migrate_data
接着便是調用到了nova/compute/rpcapi.py中的live_migration函數,該函數遠程調用了nova-compute服務的live_migrate方法,交給nova-compute服務來進行處理:
def live_migration(self, context, dest, instance, block_migration,
                   migration, migrate_data):
    self._do_live_migration(*args, **kwargs)

def _do_live_migration(self, context, dest, instance, block_migration,
                       migration, migrate_data):
    # nova/compute/rpcapi.py
    migrate_data = self.compute_rpcapi.pre_live_migration(
        context, instance,
        block_migration, disk,
        dest, migrate_data)
    self.driver.live_migration(context, instance, dest,
                               self._post_live_migration,
                               self._rollback_live_migration,
                               block_migration, migrate_data)
The core of _do_live_migration() is the pair of calls to pre_live_migration() and live_migration(). pre_live_migration() is an RPC that runs pre_live_migration() on the destination host:
def pre_live_migration(self, context, instance, block_migration, disk,
                       migrate_data):
    block_device_info = self._get_instance_block_device_info(
        context, instance, refresh_conn_info=True)
    network_info = self.network_api.get_instance_nw_info(context, instance)
    self._notify_about_instance_usage(
        context, instance, "live_migration.pre.start",
        network_info=network_info)
    # Connect the volumes, plug the network, and so on
    migrate_data = self.driver.pre_live_migration(context,
                                                  instance,
                                                  block_device_info,
                                                  network_info,
                                                  disk,
                                                  migrate_data)
    LOG.debug('driver pre_live_migration data is %s', migrate_data)
    # NOTE(tr3buchet): setup networks on destination host
    # Set up networking on the destination host
    self.network_api.setup_networks_on_host(context, instance, self.host)

    # Creating filters to hypervisors and firewalls.
    # An example is that nova-instance-instance-xxx,
    # which is written to libvirt.xml(Check "virsh nwfilter-list")
    # This nwfilter is necessary on the destination host.
    # In addition, this method is creating filtering rule
    # onto destination host.
    # Create the network filtering rules on the destination host before
    # the migration starts
    self.driver.ensure_filtering_rules_for_instance(instance,
                                                    network_info)
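As an aside, the per-instance nwfilter rules mentioned in the comment above can be inspected through the libvirt Python bindings; a hedged example (the connection URI is a placeholder, and the name prefix follows the nova-instance-instance-xxx convention quoted in the comment):

import libvirt

conn = libvirt.open("qemu:///system")        # assumed local connection
for nwfilter in conn.listAllNWFilters():
    # these are the filters that "virsh nwfilter-list" would also show
    if nwfilter.name().startswith("nova-instance-"):
        print(nwfilter.name())
conn.close()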
After pre_live_migration() has finished on the destination, the source host calls live_migration() to start the actual migration, reaching live_migration() in nova/virt/libvirt/driver.py, which calls _live_migration():
def _live_migration(self, context, instance, dest, post_method,
                    recover_method, block_migration,
                    migrate_data):
    opthread = utils.spawn(self._live_migration_operation,
                           context, instance, dest,
                           block_migration,
                           migrate_data, guest,
                           device_names)
    self._live_migration_monitor(context, instance, guest, dest,
                                 post_method, recover_method,
                                 block_migration, migrate_data,
                                 finish_event, disk_paths)
There are two key calls here: _live_migration_operation(), which performs the migration itself, and _live_migration_monitor(), which tracks the migration's progress.
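nova's utils.spawn is a thin wrapper over eventlet green threads, which is what lets the operation run concurrently with the monitor. A small self-contained illustration of that split (not Nova code; the sleeps stand in for the real work):

import eventlet


def operation():
    eventlet.sleep(2)            # stands in for guest.migrate(), which blocks
    return "migration finished"


def monitor(thread):
    while not thread.dead:       # stands in for the progress-polling loop
        print("still migrating ...")
        eventlet.sleep(0.5)


gt = eventlet.spawn(operation)   # roughly what utils.spawn does underneath
monitor(gt)
print(gt.wait())                 # returns the operation's result

The real migration operation looks like this: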
def _live_migration_operation(self, context, instance, dest,
                              block_migration, migrate_data, guest,
                              device_names):
    # Calls the libvirt API to perform the migration
    # nova/virt/libvirt/guest.py
    guest.migrate(self._live_migration_uri(dest),
                  migrate_uri=migrate_uri,
                  flags=migration_flags,
                  params=params,
                  domain_xml=new_xml_str,
                  bandwidth=CONF.libvirt.live_migration_bandwidth)

def migrate(self, destination, migrate_uri=None, params=None, flags=0,
            domain_xml=None, bandwidth=0):
    # Calls libvirt's Python binding virDomainMigrateToURI to migrate the
    # domain from the current host to the given destination
    if domain_xml is None:
        self._domain.migrateToURI(
            destination, flags=flags, bandwidth=bandwidth)
    else:
        if params:
            if migrate_uri:
                # In migrateToURI3 this parameter is searched in
                # the `params` dict
                params['migrate_uri'] = migrate_uri
            self._domain.migrateToURI3(
                destination, params=params, flags=flags)
        else:
            self._domain.migrateToURI2(
                destination, miguri=migrate_uri, dxml=domain_xml,
                flags=flags, bandwidth=bandwidth)
_live_migration_monitor() mainly relies on libvirt's job-info interface to obtain progress information.
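A hedged, standalone sketch of what such progress polling looks like through the libvirt Python bindings (the connection URI and domain name are placeholders; nova itself goes through its Guest.get_job_info() wrapper rather than calling libvirt directly like this):

import time

import libvirt

conn = libvirt.open("qemu:///system")          # assumed source-host connection
dom = conn.lookupByName("instance-00000001")   # placeholder domain name

while True:
    stats = dom.jobStats()                     # dict of job statistics
    if stats.get("type") in (None, libvirt.VIR_DOMAIN_JOB_NONE):
        break                                  # no job running any more
    done = stats.get("memory_processed", 0)
    total = stats.get("memory_total", 0) or 1
    print("copied %.1f%% of guest memory" % (100.0 * done / total))
    time.sleep(1)
conn.close()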
3. Evacuation (failure migration)
How it works: an identical instance is re-created from the configuration stored in the database. The prerequisite is that the failed instance's disk data is still reachable, which is why shared storage makes evacuation possible.
Handler file: nova/api/openstack/compute/evacuate.py
When an evacuation is requested, the entry function is:
def _evacuate(self, req, id, body):
    """Permit admins to evacuate a server from a failed host
    to a new one.
    """
    self.compute_api.evacuate(context, instance, host,
                              on_shared_storage, password, force)
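For reference, a hypothetical client-side request that ends up in this handler (endpoint, token, server id, host and password are placeholders; on API microversions before 2.14 the body carries onSharedStorage, matching the on_shared_storage argument seen here):

import requests

NOVA = "http://controller:8774/v2.1"             # assumed endpoint
HEADERS = {"X-Auth-Token": "<keystone-token>",
           "Content-Type": "application/json"}

requests.post("%s/servers/%s/action" % (NOVA, "<server-uuid>"),
              json={"evacuate": {"host": "<target-host>",
                                 "onSharedStorage": True,
                                 "adminPass": "<new-password>"}},
              headers=HEADERS)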
This executes evacuate() in nova/compute/api.py:
def evacuate(self, context, instance, host, on_shared_storage,
             admin_password=None, force=None):
    # Check whether the source host is still up; if it is, abort
    # Update the instance state and build the request_spec
    .....
    # nova/conductor/api.py
    return self.compute_task_api.rebuild_instance(context,
                   instance=instance,
                   new_pass=admin_password,
                   injected_files=None,
                   image_ref=None,
                   orig_image_ref=None,
                   orig_sys_metadata=None,
                   bdms=None,
                   recreate=True,
                   on_shared_storage=on_shared_storage,
                   host=host,
                   request_spec=request_spec,
                   )
The RPC call goes straight to rebuild_instance() in nova/compute/manager.py on the destination host:
def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                     injected_files, new_pass, orig_sys_metadata,
                     bdms, recreate, on_shared_storage=None,
                     preserve_ephemeral=False, migration=None,
                     scheduled_node=None, limits=None):
    # Claim resources on the host
    # Fetch the image metadata
    self._do_rebuild_instance_with_claim(
        claim_ctxt, context, instance, orig_image_ref,
        image_ref, injected_files, new_pass, orig_sys_metadata,
        bdms, recreate, on_shared_storage, preserve_ephemeral)
_do_rebuild_instance_with_claim() then calls _do_rebuild_instance():
def _do_rebuild_instance(self, context, instance, orig_image_ref,
                         image_ref, injected_files, new_pass,
                         orig_sys_metadata, bdms, recreate,
                         on_shared_storage, preserve_ephemeral):
    orig_vm_state = instance.vm_state
    # For an evacuation both recreate and on_shared_storage are True
    # Check whether shared storage is in use
    .....
    if recreate:
        # nova/network/api.py
        # Set up networking on this host
        self.network_api.setup_networks_on_host(
            context, instance, self.host)
        # For nova-network this is needed to move floating IPs
        # For neutron this updates the host in the port binding
        # TODO(cfriesen): this network_api call and the one above
        # are so similar, we should really try to unify them.
        self.network_api.setup_instance_network_on_host(
            context, instance, self.host)

    network_info = compute_utils.get_nw_info_for_instance(instance)
    # Get the instance's block device information
    if bdms is None:
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
    block_device_info = \
        self._get_instance_block_device_info(
            context, instance, bdms=bdms)

    def detach_block_devices(context, bdms):
        for bdm in bdms:
            if bdm.is_volume:
                self._detach_volume(context, bdm.volume_id, instance,
                                    destroy_bdm=False)

    files = self._decode_files(injected_files)

    kwargs = dict(
        context=context,
        instance=instance,
        image_meta=image_meta,
        injected_files=files,
        admin_password=new_pass,
        bdms=bdms,
        detach_block_devices=detach_block_devices,
        attach_block_devices=self._prep_block_device,
        block_device_info=block_device_info,
        network_info=network_info,
        preserve_ephemeral=preserve_ephemeral,
        recreate=recreate)
    try:
        # Avoid tracking resources against the wrong migration context
        with instance.mutated_migration_context():
            # The libvirt driver does not override rebuild(), so this falls
            # through to the default implementation below
            self.driver.rebuild(**kwargs)
    except NotImplementedError:
        # NOTE(rpodolyaka): driver doesn't provide specialized version
        # of rebuild, fall back to the default implementation
        # Detach and re-attach the disks, then call self.driver.spawn to
        # re-create the instance
        self._rebuild_default_impl(**kwargs)
    # Once the instance has been re-created, update its state in the
    # database
    self._update_instance_after_spawn(context, instance)
    instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])

    if orig_vm_state == vm_states.STOPPED:
        LOG.info(_LI("bringing vm to original state: '%s'"),
                 orig_vm_state, instance=instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.progress = 0
        instance.save()
        self.stop_instance(context, instance, False)
    self._update_scheduler_instance_info(context, instance)
    self._notify_about_instance_usage(
        context, instance, "rebuild.end",
        network_info=network_info,
        extra_usage_info=extra_usage_info)
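The try/except NotImplementedError in the code above is a simple fallback pattern: drivers that do not override rebuild() automatically get the generic re-spawn path. A tiny, self-contained illustration (the class and function names here are made up):

class BaseDriver(object):
    def rebuild(self, **kwargs):
        raise NotImplementedError()     # most drivers leave this unimplemented


class LibvirtLikeDriver(BaseDriver):
    pass                                # no specialised rebuild()


def do_rebuild(driver, **kwargs):
    try:
        driver.rebuild(**kwargs)
    except NotImplementedError:
        # Fall back to the generic path: detach/re-attach the disks and
        # re-spawn the guest, which is what _rebuild_default_impl does in Nova.
        print("falling back to the default rebuild implementation")


do_rebuild(LibvirtLikeDriver())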