今天來學習下R中字符串處理操作,主要是stringr包中的字符串處理函數的用法。
先導入stringr包:可以用library(stringr)或require(stringr)加載整個包,也可以不加載,直接以stringr::函數名的形式調用;這幾種方式都行。
一、檢測是否匹配
我們先定義一個字符向量animal和兩個字符串str1、str2,在此基礎上演示各個函數的基本用法。
1 library(stringr) 2 animal<-c("cow","dog","sheep","goat","pig","monkey","cat","cat") 3 str1<-"I love cat, cat cat !" 4 str2<-"lovelovelove" 5 6 str_detect(animal,"cow") #逐個元素檢測,匹配到指定字符串返回TRUE,否則返回FALSE 7 [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 8 9 str_detect(str1,"love") 10 [1] TRUE 11 12 str_which(animal,"dog") #返回匹配到的元素在向量中的下標 13 [1] 2 14 15 str_which(animal,"cat") 16 [1] 7 8 17 18 str_which(str2,"love") #str2是長度為1的向量,只要匹配到就返回下標1,與匹配次數無關 19 [1] 1 20 21 str_count(animal,"cat") #返回每個元素中的匹配次數 22 [1] 0 0 0 0 0 0 1 1 23 24 str_count(str1,"cat") 25 [1] 3 26 27 str_locate(animal,"cat") #返回每個元素中第一個匹配的起始和結束位置,未匹配則為NA 28 start end 29 [1,] NA NA 30 [2,] NA NA 31 [3,] NA NA 32 [4,] NA NA 33 [5,] NA NA 34 [6,] NA NA 35 [7,] 1 3 36 [8,] 1 3 37 38 str_locate(str1,"cat") 39 start end 40 [1,] 8 10 41 42 str_locate(str2,"love") #有多個匹配時只返回第一個匹配的起始和結束位置(要全部位置可用str_locate_all) 43 start end 44 [1,] 1 4 45
二、子串提取
1 str_sub(str1,1,3) # 後面兩個參數為起始,結束位置 2 [1] "I l" 3 4 str_sub(str1,1) # 可以只跟起始位置,默認到結束位置 5 [1] "I love cat, cat cat !" 6 7 str_sub(str1,3) 8 [1] "love cat, cat cat !" 9 10 str_sub(str1,-5) #位置還可以為負數,表示從末尾倒數 11 [1] "cat !" 12 13 str_sub(str1,-5,-1) 14 [1] "cat !" 15 16 str_subset(str1,"a") #匹配到指定字符就返回整個字符串 17 [1] "I love cat, cat cat !" 18 19 str_subset(str1,"x") #匹配不到則返回空 20 character(0) 21 22 str_extract(str1,"cat") #返回第一個匹配到字符串 23 [1] "cat" 24 str_extract(str1,"ca") 25 [1] "ca" 26 27 str_extract_all(str1,"cat") #返回所有匹配到字符串 列表形式返回 28 [[1]] 29 [1] "cat" "cat" "cat" 30 31 str_extract_all(str1,"[aoe]") #返回所有匹配到字符串 列表形式返回 32 [[1]] 33 [1] "o" "e" "a" "a" "a" 34 35 str_match(str1,"cat") #返回第一個匹配到字符串 矩陣形式返回 36 [,1] 37 [1,] "cat" 38 39 str_match_all(str1,"cat") #返回所有匹配到字符串 矩陣形式返回 40 [[1]] 41 [,1] 42 [1,] "cat" 43 [2,] "cat" 44 [3,] "cat" 45 46 str_match_all(str2,"love") 47 [[1]] 48 [,1] 49 [1,] "love" 50 [2,] "love" 51 [3,] "love" 52 53 str_match(str2,"love") 54 [,1] 55 [1,] "love" 56 57 str_match_all(str1,"(I|cat)") #可以用|同時匹配多個模式;返回矩陣的第一列是完整匹配,第二列是括號(捕獲組)匹配到的內容,這裡整個模式都在括號內,所以兩列相同 58 [[1]] 59 [,1] [,2] 60 [1,] "I" "I" 61 [2,] "cat" "cat" 62 [3,] "cat" "cat" 63 [4,] "cat" "cat"
三、字符串長度處理
1 str_length(str2) # 返回字符串長度 2 [1] 12 3 4 str_length("good job !") # 空格也算一個字符長度 5 [1] 10 6 7 str_trunc(str2,4) #將字符串截斷到指定寬度(此處為4),被截去的部分用省略符代替,默認省略符為"..." 8 [1] "l..." 9 10 str_trunc(str2,4,ellipsis = "*") #ellipsis 指定省略符 11 [1] "lov*" 12 13 str_trunc(str2,width = 8,ellipsis = "#") #width指定截斷後的總寬度(含省略符),此處為前7個字符加1個省略符 14 [1] "lovelov#" 15 16 str_trunc(str2,width = 8,side = c("left"),ellipsis = "#") # side指定從哪一側截斷(right,center,left) 17 [1] "#ovelove" 18 19 str_trim("sssss\n") # 去掉字符串首尾空字符,換行,空格等;字符串內部空字符無法去除 20 [1] "sssss" 21 str_trim(" sssss\n") 22 [1] "sssss"
四、字符串替換
1 str1 2 [1] "I love cat, cat cat !" 3 4 str_sub(str1,1,6) #提取子串 5 [1] "I love" 6 7 str_sub(str1,1,6)<-"she love" #子串替換 8 str1 9 [1] "she love cat, cat cat !" 10 11 str_sub(animal,1,1)<-"F" #向量替換也可以 12 animal 13 [1] "Fow" "Fog" "Fheep" "Foat" "Fig" "Fonkey" "Fat" 14 [8] "Fat" 15 16 str1<-"I love cat, cat cat !" 17 18 str_replace(str1,"cat","dog") #替換第一個匹配項 19 [1] "I love dog, cat cat !" 20 21 str_replace_all(str1,"cat","dog") # 替換所有匹配項 22 [1] "I love dog, dog dog !" 23 24 str_to_lower(str1) # 全部轉為小寫字母 25 [1] "i love cat, cat cat !" 26 27 str_to_upper(str1) # 全部轉為大寫字母 28 [1] "I LOVE CAT, CAT CAT !" 29 30 str_to_title(str1) # 單詞首字母轉為大寫 31 [1] "I Love Cat, Cat Cat !" 32 33 str_to_title(str2) 34 [1] "Lovelovelove" 35
五、字符串分割和連接
1 str_c(str1,str2,sep="+") # 字符串連接 2 [1] "I love cat, cat cat !+lovelovelove" 3 4 str_c(animal,str2,sep="+") #向量中每個元素依次與字符串連接 5 [1] "Fow+lovelovelove" "Fog+lovelovelove" "Fheep+lovelovelove" 6 [4] "Foat+lovelovelove" "Fig+lovelovelove" "Fonkey+lovelovelove" 7 [7] "Fat+lovelovelove" "Fat+lovelovelove" 8 9 str_c(animal,sep="",collapse = "+") # 用collapse把向量各元素連接成一個字符串 10 [1] "Fow+Fog+Fheep+Foat+Fig+Fonkey+Fat+Fat" 11 12 str_dup(str1,2) #字符串重復,數字代表次數 13 [1] "I love cat, cat cat !I love cat, cat cat !" 14 str_dup(str2,3) 15 [1] "lovelovelovelovelovelovelovelovelove" 16 17 str_split_fixed(animal,"",n=2) #分割字符串:第二個參數為分隔符(""表示按單個字符分割),n為分割份數,返回矩陣 18 [,1] [,2] 19 [1,] "F" "ow" 20 [2,] "F" "og" 21 [3,] "F" "heep" 22 [4,] "F" "oat" 23 [5,] "F" "ig" 24 [6,] "F" "onkey" 25 [7,] "F" "at" 26 [8,] "F" "at" 27 28 str_split_fixed(str2,"",n=4) 29 [,1] [,2] [,3] [,4] 30 [1,] "l" "o" "v" "elovelove" 31 32 str_split(str2,"",4) #分割字符串:參數含義同上(分隔符,n=分割份數),返回列表 33 [[1]] 34 [1] "l" "o" "v" "elovelove" 35 36 str_glue("pi is {str1}") # 字符串插值,{}花括號內可以是任意R表達式(變量、常量、函數調用等) 37 pi is I love cat, cat cat ! 38 39 str_glue("pi is {pi}") 40 pi is 3.14159265358979 41 42 str_glue("log2(8) is {log2(8)}") 43 log2(8) is 3 44 45 str_glue_data(mtcars, "{rownames(mtcars)} has {hp} hp") #對數據框(或列表)逐行插值,{}內可直接引用列名(以下輸出僅列出前幾行) 46 Mazda RX4 has 110 hp 47 Mazda RX4 Wag has 110 hp 48 Datsun 710 has 93 hp 49 Hornet 4 Drive has 110 hp 50 Hornet Sportabout has 175 hp 51 Valiant has 105 hp 52 53 str_glue_data(mtcars, "{rownames(mtcars)} has {hp*1000} hp") # 還可以做相應計算 54 Mazda RX4 has 110000 hp 55 Mazda RX4 Wag has 110000 hp 56 Datsun 710 has 93000 hp 57 Hornet 4 Drive has 110000 hp 58 59 str_glue_data(mtcars, "{rownames(mtcars)} has {substr(wt,1,2)} wt") # 還可以調用函數,如用substr截取子串 60 Mazda RX4 has 2. wt 61 Mazda RX4 Wag has 2. wt 62 Datsun 710 has 2. wt 63 Hornet 4 Drive has 3. wt
六、字符串排序
1 str2 2 [1] "lovelovelove" 3 str_order(str2,decreasing = T) # 返回排序後各元素在原向量中的下標(str2長度為1,所以結果是1) 4 [1] 1 5 6 animal 7 [1] "Fow" "Fog" "Fheep" "Foat" "Fig" "Fonkey" "Fat" 8 [8] "Fat" 9 animal[str_order(animal,decreasing = T)] 10 [1] "Fow" "Fonkey" "Fog" "Foat" "Fig" "Fheep" "Fat" 11 [8] "Fat" 12 13 animal 14 [1] "Fow" "Fog" "Fheep" "Foat" "Fig" "Fonkey" "Fat" 15 [8] "Fat" 16 str_sort(animal) #直接對向量字符串排序 17 [1] "Fat" "Fat" "Fheep" "Fig" "Foat" "Fog" "Fonkey" 18 [8] "Fow" 19