[讀書筆記]機器學習：實用案例解析（1）

本文轉載自查看原文 2016-06-05 22:04 1059 讀書筆記/ R語言/ ML_for_Hackers

第1章使用R語言

# Machine Learning for Hackers
# Chapter 1

library(ggplot2)
library(plyr)

# Load the raw UFO sightings table.
# - The .tsv file is tab-delimited.
# - Strings may be converted to factors by default, so stringsAsFactors = FALSE
#   keeps them as character.
# - header = FALSE stops the first row from being treated as column names.
# - na.strings = "" reads empty strings as NA.
ufo <- read.delim(
  file = "ML_for_Hackers/01-Introduction/data/ufo/ufo_awesome.tsv",
  header = FALSE,
  sep = "\t",
  na.strings = "",
  stringsAsFactors = FALSE
)

head(ufo) 可查看數據集前6行

tail(ufo) 可查看后6行

# Assign the six column names (the file was read without a header row).
# names() / colnames() can be used both to set and to read column names.
colnames(ufo) <- c(
  "DateOccurred", "DateReported",
  "Location", "ShortDescription",
  "Duration", "LongDescription"
)

# as.Date() turns strings into Date objects; the format can be specified
# (see ?as.Date).
# Per the original notes, converting the raw columns fails with an
# "input too long" error, which points to malformed rows.
# Show the first few values of column 1 for rows where either date string is
# not exactly 8 characters long.
head(
  ufo[[1]][
    which(!(nchar(ufo$DateOccurred) == 8 & nchar(ufo$DateReported) == 8))
  ]
)

# Flag usable rows: TRUE only when both date strings are present and exactly
# 8 characters long (they are parsed as %Y%m%d further down).
# nchar() yields NA for a missing string, and an NA inside a logical row index
# produces an all-NA row, so missing dates are explicitly marked as bad rows
# instead of being left as NA.
good.rows <- !is.na(ufo$DateOccurred) & !is.na(ufo$DateReported) &
  nchar(ufo$DateOccurred) == 8 & nchar(ufo$DateReported) == 8
# Number of rows that fail the check.
sum(!good.rows)
# Keep only the well-formed rows.
ufo <- ufo[good.rows, ]

　　運行結果是731條，而書上是371條，應該是書上有誤

# Convert both 8-character date strings into Date objects (format %Y%m%d).
ufo[c("DateOccurred", "DateReported")] <- lapply(
  ufo[c("DateOccurred", "DateReported")],
  as.Date,
  format = "%Y%m%d"
)

# Clean one raw sighting location of the form "City, ST".
#
# Args:
#   l: A location string.
#
# Returns:
#   A character vector of length 2, c(city, state), with surrounding
#   whitespace removed. An NA pair is returned when strsplit() raises an
#   error (e.g. non-character input) or when the input does not consist of
#   exactly two comma-separated parts.
get.location <- function(l) {
  missing.location <- c(NA_character_, NA_character_)
  split.location <- tryCatch(
    strsplit(l, ",")[[1]],
    error = function(e) missing.location
  )
  clean.location <- trimws(split.location)
  # Require exactly two parts. A shorter result would be silently recycled
  # when the per-row vectors are combined with rbind (a location without a
  # comma would end up as both city and state).
  if (length(clean.location) != 2) {
    return(missing.location)
  }
  clean.location
}

# lapply ("list-apply") calls get.location() on each Location entry in turn
# and returns the results as a list, one element per entry.
city.state <- lapply(X = ufo[["Location"]], FUN = get.location)

# Turn the list into a matrix: do.call() invokes rbind with the list elements
# as its arguments, stacking them row by row.
location.matrix <- do.call("rbind", city.state)
# Add two new columns to ufo; the state is lower-cased so that all values
# share one format.
ufo[["USCity"]] <- location.matrix[, 1]
ufo[["USState"]] <- tolower(location.matrix[, 2])

# Identify non-US place names and set them to NA.
# Lower-case two-letter codes of the 50 US states.
us.states <- c(
  "ak", "al", "ar", "az", "ca", "co", "ct", "de", "fl", "ga",
  "hi", "ia", "id", "il", "in", "ks", "ky", "la", "ma", "md",
  "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", "ne", "nh",
  "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "ri", "sc",
  "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", "wy"
)
# Any state value that is not in the list becomes NA ...
ufo$USState[!(ufo$USState %in% us.states)] <- NA
# ... and the city is blanked out wherever the state is NA.
is.na(ufo$USCity) <- is.na(ufo$USState)

# Keep only the records located inside the US (those with a non-NA state).
ufo.us <- ufo[!is.na(ufo$USState), ]

# Analysis along the time dimension.
# Preliminary step: summarise the range of sighting dates, then draw a quick
# histogram of them.
summary(ufo.us[["DateOccurred"]])
quick.hist <- ggplot(data = ufo.us, mapping = aes(x = DateOccurred)) +
  scale_x_date(date_breaks = "50 years") +
  geom_histogram()
print(quick.hist)

# Keep only sightings from 1990 onwards and redraw the histogram.
ufo.us <- subset(ufo.us, DateOccurred >= as.Date("1990-01-01"))
# The filtered data starts in 1990 (the final plot title in this file gives
# the span as 1990-2010), so 50-year axis breaks would leave the x axis almost
# unlabeled. Use 5-year breaks, labelled with the four-digit year.
quick.hist.new <- ggplot(ufo.us, aes(x = DateOccurred)) +
  geom_histogram() +
  scale_x_date(date_breaks = "5 years", date_labels = "%Y")
print(quick.hist.new)

# Count sightings per state and year-month.
# 1. Reduce each sighting date to a "YYYY-MM" string.
# 2. Count the rows for each (USState, YearMonth) pair.
# 3. Build a data frame holding every state combined with every month in the
#    observed range (columns "s" and "date.strings").
ufo.us$YearMonth <- strftime(ufo.us$DateOccurred, format = "%Y-%m")
sightings.counts <- ddply(ufo.us, .(USState, YearMonth), nrow)
# Anchor the monthly sequence on the first day of the earliest month.
# seq.Date(by = "month") keeps the day of month and counts an invalid day
# forward into the next month, so starting on e.g. the 31st would skip some
# months and duplicate others.
date.range <- seq.Date(
  from = as.Date(format(min(ufo.us$DateOccurred), "%Y-%m-01")),
  to = max(ufo.us$DateOccurred),
  by = "month"
)
date.strings <- strftime(date.range, "%Y-%m")
states.dates <- lapply(us.states, function(s) cbind(s, date.strings))
states.dates <- data.frame(do.call(rbind, states.dates), stringsAsFactors = FALSE)

# Merge the full state x month grid with the observed counts.
# merge() joins two data frames on matching columns; by.x and by.y name the
# key columns on each side, and all = TRUE keeps unmatched rows, filling the
# missing values with NA.
all.sightings <- merge(states.dates, sightings.counts,
                       by.x = c("s", "date.strings"),
                       by.y = c("USState", "YearMonth"), all = TRUE)
# Tidy up all.sightings: rename columns, turn missing counts into 0 and
# convert the variable types.
names(all.sightings) <- c("State", "YearMonth", "Sightings")
all.sightings$Sightings[is.na(all.sightings$Sightings)] <- 0
# Derive each row's date from its own "YYYY-MM" key (first day of that month)
# rather than assuming the merged rows line up one-to-one, in order, with a
# repeated date sequence; that assumption fails with an error or mislabels
# rows whenever the merge yields a different row count or ordering.
all.sightings$YearMonth <- as.Date(paste0(all.sightings$YearMonth, "-01"))
all.sightings$State <- as.factor(toupper(all.sightings$State))

# Analyse the data: one line chart of monthly sightings per state.
# - geom_line() draws the curves; aes(color = "darkblue") maps a constant
#   label, and scale_color_manual() binds that label to the colour
#   "darkblue" (with the legend suppressed).
# - facet_wrap() splits the plot into panels by the categorical variable
#   State.
# - theme_bw() sets the background theme.
state.plot <- ggplot(
  data = all.sightings,
  mapping = aes(x = YearMonth, y = Sightings)
) +
  geom_line(mapping = aes(color = "darkblue")) +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  facet_wrap(~State, nrow = 10, ncol = 5) +
  labs(
    x = "Time",
    y = "Number of Sightings",
    title = "Number of UFO sightings by Month-Year and U.S. State (1990-2010)"
  ) +
  theme_bw()
print(state.plot)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 《機器學習：實用案例解析》，讀書筆記機器學習（周志華）讀書筆記【讀書筆記與思考】Andrew 機器學習課程筆記《機器學習》(周志華)西瓜書讀書筆記(完結) PRML讀書筆記——機器學習導論機器學習實用案例解析（1）使用R語言機器學習讀書筆記03 聚類(K-Means) 機器學習實戰 - 讀書筆記(06) – SVM支持向量機機器學習實戰:基於Scikit-Learn和TensorFlow 讀書筆記第6章決策樹《圖解機器學習-杉山將著》讀書筆記---CH3