記一次邏輯卷磁盤故障導致邏輯卷不可用的問題

本文轉載自查看原文 2020-07-06 16:19 1238

生產環境中一塊磁盤故障，由於沒有先在系統中通過 vgreduce --removemissing 把故障盤從卷組中移除，就直接熱插拔換盤並重做了raid

系統無法重啟，進入救援模式，注釋掉受影響的磁盤掛載點，重啟進入系統之後，做了如下操作

# parted /dev/sdd mklabel gpt

首先是將新盤設置成gpt格式
# parted /dev/sdd mkpart primary 2048s 100%

然後是根據生產的環境劃分一塊主分區

# partprobe /dev/sdd

# partprobe /dev/sdd1

動態更新/dev/sdd的信息，在不重啟服務器的情況下重讀分區

# cat /etc/lvm/backup/vg1 
# Generated by LVM2 version 2.02.171(2)-RHEL7 (2017-05-03): Tue Oct 30 13:58:05 2018

contents = "Text Format Volume Group"
version = 1

description = "Created *after* executing 'lvcreate -l 100%FREE -n lv1 vg1'"

creation_host = "msw8b0201"    # Linux msw8b0201 3.10.0-693.el7.x86_64 #1 SMP Thu Jul 6 19:56:57 EDT 2017 x86_64
creation_time = 1540879085    # Tue Oct 30 13:58:05 2018

vg1 {
    id = "TxN32k-8J83-thCH-UwnD-uKkH-phn8-3rCaKs"
    seqno = 2
    format = "lvm2"            # informational
    status = ["RESIZEABLE", "READ", "WRITE"]
    flags = []
    extent_size = 8192        # 4 Megabytes
    max_lv = 0
    max_pv = 0
    metadata_copies = 0

    physical_volumes {

        pv0 {
            id = "UnQm8F-10fW-erxa-O4Lv-I0RA-obiI-XiZKC2"
            device = "/dev/sdb1"    # Hint only

            status = ["ALLOCATABLE"]
            flags = []
            dev_size = 1873041408    # 893.136 Gigabytes
            pe_start = 2048
            pe_count = 228642    # 893.133 Gigabytes
        }

        pv1 {
            id = "brRjwc-g3lm-esHD-PA6s-oDPa-fzfN-n2vN4D"
            device = "/dev/sdc1"    # Hint only

            status = ["ALLOCATABLE"]
            flags = []
            dev_size = 1873041408    # 893.136 Gigabytes
            pe_start = 2048
            pe_count = 228642    # 893.133 Gigabytes
        }

        pv2 {
            id = "d3vWoV-zMwG-U6bg-X1Yz-T929-jVFd-2QTifo"
            device = "/dev/sdd1"    # Hint only

            status = ["ALLOCATABLE"]
            flags = []
            dev_size = 1873041408    # 893.136 Gigabytes
            pe_start = 2048
            pe_count = 228642    # 893.133 Gigabytes
        }

        pv3 {
            id = "84rd7v-Bblg-Xl3Y-4HD1-YvXS-Rd8L-100T90"
            device = "/dev/sde1"    # Hint only

            status = ["ALLOCATABLE"]
            flags = []
            dev_size = 1873041408    # 893.136 Gigabytes
            pe_start = 2048
            pe_count = 228642    # 893.133 Gigabytes
        }
    }

    logical_volumes {

        lv1 {
            id = "7m4Q3c-Gysj-MpxU-NeDt-76iI-sqiW-jF2qBr"
            status = ["READ", "WRITE", "VISIBLE"]
            flags = []
            creation_time = 1540879085    # 2018-10-30 13:58:05 +0800
            creation_host = "msw8b0201"
            segment_count = 4

            segment1 {
                start_extent = 0
                extent_count = 228642    # 893.133 Gigabytes

                type = "striped"
                stripe_count = 1    # linear

                stripes = [
                    "pv0", 0
                ]
            }
            segment2 {
                start_extent = 228642
                extent_count = 228642    # 893.133 Gigabytes

                type = "striped"
                stripe_count = 1    # linear

                stripes = [
                    "pv1", 0
                ]
            }
            segment3 {
                start_extent = 457284
                extent_count = 228642    # 893.133 Gigabytes

                type = "striped"
                stripe_count = 1    # linear

                stripes = [
                    "pv2", 0
                ]
            }
            segment4 {
                start_extent = 685926
                extent_count = 228642    # 893.133 Gigabytes

                type = "striped"
                stripe_count = 1    # linear

                stripes = [
                    "pv3", 0
                ]
            }
        }
    }

}

這一步是為了記住邏輯卷的各項信息，比如uuid，原始系統的狀態，預防新建vg lv覆蓋之前的信息

# parted /dev/sdd
GNU Parted 3.1
Using /dev/sdd
Welcome to GNU Parted! Type 'help' to view a list of commands.
(parted) mklabel gpt                                                      
Warning: The existing disk label on /dev/sdd will be destroyed and all data on
this disk will be lost. Do you want to continue?
Yes/No? y                                                                 
(parted) mkpart primary 2048s 100%
(parted) print                                                            
Model: AVAGO HW-SAS3508 (scsi)
Disk /dev/sdd: 959GB
Sector size (logical/physical): 512B/4096B
Partition Table: gpt
Disk Flags: 

Number  Start   End    Size   File system  Name     Flags
 1      1049kB  959GB  959GB  ext4         primary

(parted) rm                                                               
align-check  help         mktable      quit         select       unit
disk_set     mklabel      name         rescue       set          version
disk_toggle  mkpart       print        rm           toggle       
(parted) rm 
Partition number? 1                                                       
(parted) print                                                            
Model: AVAGO HW-SAS3508 (scsi)
Disk /dev/sdd: 959GB
Sector size (logical/physical): 512B/4096B
Partition Table: gpt
Disk Flags: 

Number  Start  End  Size  File system  Name  Flags

(parted) mkpart primary 2048s 100%
(parted) print                                                            
Model: AVAGO HW-SAS3508 (scsi)
Disk /dev/sdd: 959GB
Sector size (logical/physical): 512B/4096B
Partition Table: gpt
Disk Flags: 

Number  Start   End    Size   File system  Name     Flags
 1      1049kB  959GB  959GB  ext4         primary

(parted) quit                                                             
Information: You may need to update /etc/fstab.

此時的/dev/sdd狀態不對，正常的情況下，新建分區的file system一欄不應該顯示ext4（這說明分區起始位置上還殘留著舊的ext4簽名）。我嘗試刪除分區重建，還是沒有成功，後來決定直接強制創建pv，看看能不能成功，看接下來的一步

# pvcreate /dev/sdd1 --uuid 'd3vWoV-zMwG-U6bg-X1Yz-T929-jVFd-2QTifo' --restorefile /etc/lvm/backup/vg1 
  Couldn't find device with uuid d3vWoV-zMwG-U6bg-X1Yz-T929-jVFd-2QTifo.
  WARNING: Device for PV d3vWoV-zMwG-U6bg-X1Yz-T929-jVFd-2QTifo not found or rejected by a filter.
WARNING: ext4 signature detected on /dev/sdd1 at offset 1080. Wipe it? [y/n]: y
  Wiping ext4 signature on /dev/sdd1. 　　　　　　（註：這裡清除了上面的ext4簽名，所以操作是成功了的）
  Physical volume "/dev/sdd1" successfully created.

# vgcfgrestore -f /etc/lvm/backup/vg1 vg1
Restored volume group vg1

根據系統配置文件重建vg1

# vgscan
Reading volume groups from cache.
Found volume group "sys" using metadata type lvm2
Found volume group "vg1" using metadata type lvm2

此時已經沒有報錯了

# vgscan
Reading volume groups from cache.
Found volume group "sys" using metadata type lvm2
WARNING: Device for PV d3vWoV-zMwG-U6bg-X1Yz-T929-jVFd-2QTifo not found or rejected by a filter.
Found volume group "vg1" using metadata type lvm2

上面這段是恢復之前的vgscan輸出，有WARNING報錯；現在報錯已經消失，所以就可以接著進行了

# vgchange -ay vg1
1 logical volume(s) in volume group "vg1" now active

激活卷組vg1

# mkfs.xfs -m uuid=7m4Q3c-Gysj-MpxU-NeDt-76iI-sqiW-jF2qBr /dev/vg1/lv1 
Illegal value uuid=7m4Q3c-Gysj-MpxU-NeDt-76iI-sqiW-jF2qBr for -m uuid option
Usage: mkfs.xfs
/* blocksize */        [-b log=n|size=num]
/* metadata */        [-m crc=0|1,finobt=0|1,uuid=xxx]
/* data subvol */    [-d agcount=n,agsize=n,file,name=xxx,size=num,
                (sunit=value,swidth=value|su=num,sw=num|noalign),
                sectlog=n|sectsize=num
/* force overwrite */    [-f]
/* inode size */    [-i log=n|perblock=n|size=num,maxpct=n,attr=0|1|2,
                projid32bit=0|1]
/* no discard */    [-K]
/* log subvol */    [-l agnum=n,internal,size=num,logdev=xxx,version=n
                sunit=value|su=num,sectlog=n|sectsize=num,
                lazy-count=0|1]
/* label */        [-L label (maximum 12 characters)]
/* naming */        [-n log=n|size=num,version=2|ci,ftype=0|1]
/* no-op info only */    [-N]
/* prototype file */    [-p fname]
/* quiet */        [-q]
/* realtime subvol */    [-r extsize=num,size=num,rtdev=xxx]
/* sectorsize */    [-s log=n|size=num]
/* version */        [-V]
            devicename
<devicename> is required unless -d name=xxx is given.
<num> is xxx (bytes), xxxs (sectors), xxxb (fs blocks), xxxk (xxx KiB),
      xxxm (xxx MiB), xxxg (xxx GiB), xxxt (xxx TiB) or xxxp (xxx PiB).
<value> is xxx (512 byte blocks).

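本意是根據配置文件中lv1的id在邏輯卷上重建文件系統，但從上面的輸出可以看到，mkfs.xfs報了Illegal value並只打印了用法說明，也就是說這條命令實際上沒有執行成功（-m uuid需要標準格式的UUID，而LVM配置文件里的id並不是這種格式），文件系統並沒有被重建。另外要注意，mkfs會重新格式化邏輯卷，如果目的是找回原有數據，不應該執行這一步。之後接著嘗試掛載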

# mount /dev/vg1/lv1 /data/
mount: wrong fs type, bad option, bad superblock on /dev/mapper/vg1-lv1,
       missing codepage or helper program, or other error

       In some cases useful info is found in syslog - try
       dmesg | tail or so.

掛載失敗，應該是文件系統有故障，所以修復一下

# xfs_repair /dev/mapper/vg1-lv1 
Phase 1 - find and verify superblock...
        - reporting progress in intervals of 15 minutes
Phase 2 - using internal log
        - zero log...
Log inconsistent or not a log (last==0, first!=1)
empty log check failed
zero_log: cannot find log head/tail (xlog_find_tail=22)

fatal error -- ERROR: The log head and/or tail cannot be discovered. Attempt to mount the
filesystem to replay the log or use the -L option to destroy the log and
attempt a repair.

修復失敗，按照提示加上-L選項重試（注意：-L會銷毀日志，日志中尚未回放的修改會丟失）

# xfs_repair -L /dev/mapper/vg1-lv1 
Phase 1 - find and verify superblock...
        - reporting progress in intervals of 15 minutes
Phase 2 - using internal log
        - zero log...
Log inconsistent or not a log (last==0, first!=1)
empty log check failed
zero_log: cannot find log head/tail (xlog_find_tail=22)
        - scan filesystem freespace and inode maps...
bad magic number
bad magic number
bad magic number
bad magic number
bad magic number
bad magic number
bad magic number
bad magic number
Metadata CRC error detected at xfs_agf block 0x14eec9008/0x1000
Metadata CRC error detected at xfs_agf block 0x140f80a08/0x1000
Metadata CRC error detected at xfs_agf block 0x1250efe08/0x1000Metadata CRC error detected at xfs_agf block 0x133038408/0x1000

Metadata CRC error detected at xfs_agf block 0x1171a7808/0x1000
Metadata CRC error detected at xfs_agf block 0x10925f208/0x1000
Metadata CRC error detected at xfs_agf block 0xed3ce608/0x1000
Metadata CRC error detected at xfs_agf block 0xfb316c08/0x1000
Metadata CRC error detected at xfs_agi block 0x140f80a10/0x1000
Metadata CRC error detected at xfs_agi block 0x14eec9010/0x1000
Metadata CRC error detected at xfs_agi block 0x133038410/0x1000
Metadata CRC error detected at xfs_agi block 0x1250efe10/0x1000bad on-disk superblock 23 - bad magic number
primary/secondary superblock 23 conflict - AG superblock geometry info conflicts with filesystem geometry

。
。
。
。

        - 05:00:46: verify and correct link counts - 32 of 32 allocation groups done
Metadata corruption detected at xfs_dir3_block block 0xdf49148/0x1000
libxfs_writebufr: write verifer failed on xfs_dir3_block bno 0xdf49148/0x1000
Metadata corruption detected at xfs_dir3_block block 0x15ce11860/0x1000
libxfs_writebufr: write verifer failed on xfs_dir3_block bno 0x15ce11860/0x1000
Metadata corruption detected at xfs_dir3_block block 0xc35f5dc0/0x1000
libxfs_writebufr: write verifer failed on xfs_dir3_block bno 0xc35f5dc0/0x1000
Metadata corruption detected at xfs_dir3_block block 0x29dd93f8/0x1000
libxfs_writebufr: write verifer failed on xfs_dir3_block bno 0x29dd93f8/0x1000
Maximum metadata LSN (8:59832) is ahead of log (1:8).
Format log to cycle 11.
r (bulk) to free list!done

重新檢查文件系統

# xfs_repair /dev/mapper/vg1-lv1
Phase 1 - find and verify superblock...
        - reporting progress in intervals of 15 minutes
Phase 2 - using internal log
        - zero log...
        - scan filesystem freespace and inode maps...
        - 05:00:59: scanning filesystem freespace - 32 of 32 allocation groups done
        - found root inode chunk
Phase 3 - for each AG...
        - scan and clear agi unlinked lists...
        - 05:00:59: scanning agi unlinked lists - 32 of 32 allocation groups done
        - process known inodes and perform inode discovery...
        - agno = 30
        - agno = 0
        - agno = 15
        - agno = 31
        - agno = 16
        - agno = 17
        - agno = 18
        - agno = 19
        - agno = 20
        - agno = 1
        - agno = 21
        - agno = 22
        - agno = 23
        - agno = 24
        - agno = 25
        - agno = 26
        - agno = 27
        - agno = 28
        - agno = 29
        - agno = 2
        - agno = 3
        - agno = 4
        - agno = 5
        - agno = 6
        - agno = 7
        - agno = 8
        - agno = 9
        - agno = 10
        - agno = 11
        - agno = 12
        - agno = 13
        - agno = 14
        - 05:00:59: process known inodes and inode discovery - 7488 of 7488 inodes done
        - process newly discovered inodes...
        - 05:00:59: process newly discovered inodes - 32 of 32 allocation groups done
Phase 4 - check for duplicate blocks...
        - setting up duplicate extent list...
        - 05:00:59: setting up duplicate extent list - 32 of 32 allocation groups done
        - check for inodes claiming duplicate blocks...
        - agno = 0
        - agno = 1
        - agno = 5
        - agno = 6
        - agno = 10
        - agno = 12
        - agno = 15
        - agno = 17
        - agno = 18
        - agno = 23
        - agno = 26
        - agno = 29
        - agno = 3
        - agno = 14
        - agno = 8
        - agno = 16
        - agno = 9
        - agno = 19
        - agno = 21
        - agno = 20
        - agno = 2
        - agno = 22
        - agno = 24
        - agno = 25
        - agno = 11
        - agno = 30
        - agno = 4
        - agno = 27
        - agno = 28
        - agno = 31
        - agno = 13
        - agno = 7
        - 05:00:59: check for inodes claiming duplicate blocks - 7488 of 7488 inodes done
Phase 5 - rebuild AG headers and trees...
        - 05:00:59: rebuild AG headers and trees - 32 of 32 allocation groups done
        - reset superblock...
Phase 6 - check inode connectivity...
        - resetting contents of realtime bitmap and summary inodes
        - traversing filesystem ...
        - traversal finished ...
        - moving disconnected inodes to lost+found ...
Phase 7 - verify and correct link counts...
        - 05:00:59: verify and correct link counts - 32 of 32 allocation groups done
done

這次沒有報錯

# mount /dev/mapper/vg1-lv1 /data/

掛載也沒報錯，最後把/etc/fstab中之前注釋掉的掛載信息前面的#號去掉。但是數據丟了，只能從備節點恢復數據。總結：邏輯卷有磁盤故障的時候，應該先在系統中通過命令把故障pv從卷組中踢出去，然後換磁盤，重做pv，再加入卷組。這個流程目前還沒測試，下次有類似的故障，我會補上，看看按照正常的步驟會有什麼現象

下次注意事項

1.首先備份邏輯卷的配置文件/etc/lvm/backup/vg1

做完之後，看看uuid怎麼變化，配置文件怎麼變化

2.下次可能用到的命令

vgreduce指令：從卷組中刪除物理卷

《Linux指令范例速查手冊》第11章磁盤管理，本章介紹的磁盤管理指令包括磁盤分區、磁盤引導和LVM邏輯卷管理等。本節為大家介紹vgreduce指令：從卷組中刪除物理卷。

作者：黃照鶴　來源：清華大學出版社

11.28 vgreduce指令：從卷組中刪除物理卷

【語法】vgreduce [選項] [參數]

【功能介紹】vgreduce指令通過刪除LVM卷組中的物理卷來減少卷組容量。

【選項說明】

選項	功能
-a	如果命令行中沒有指定要刪除的物理卷，則刪除所有的空物理卷
--removemissing	刪除卷組中丟失的物理卷，使卷組恢復正常狀態

【參數說明】

參數	功能
卷組	指定要操作的卷組名稱
物理卷列表	指定要刪除的物理卷列表

【經驗技巧】不能刪除LVM卷組中剩餘的最後一個物理卷。

【示例362】移除物理卷。具體步驟如下：

使用vgreduce指令從卷組"vg2000"中移除物理卷"/dev/sdb2"。在命令行中輸入下面的命令：

[root@hn ~]# vgreduce vg2000 /dev/sdb2
#將物理卷"/dev/sdb2"從卷組"vg2000"中刪除

輸出信息如下：

Removed "/dev/sdb2" from volume group "vg2000"

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Kafka服務不可用(宕機)問題踩坑記 VC斷點不可用的問題 centos6 升級pip后導致pip不可用記一次因jenkins重啟導致的項目全部丟失的問題手抖把Python2.7卸載了,導致了自己的yum不可用以及yum因python版本無法使用的問題記一次自動恢復的支付故障記一次ceph集群的嚴重故障記一次Postgres CPU爆滿故障記一次mysql故障恢復 android開發學習 ------- 關於getSupportFragmentManager()不可用的問題