Part 3: Top 50 ggplot2 Visualizations - The Master List,
結合進階1、2內容構建圖形
有效的圖形是:
- 不扭曲事實 傳遞正確的信息
- 簡潔優雅
- 美觀是為了凸顯信息 而不要蓋過信息
- 不超載信息
1 相關性圖
散點圖
最常用
# Scatterplot of midwest counties: area against total population,
# coloured by state and sized by population density.
# install.packages("ggplot2")

# Load package and data
options(scipen = 999)  # turn off scientific notation like 1e+48
library(ggplot2)
theme_set(theme_bw())  # pre-set the bw theme
data("midwest", package = "ggplot2")
# midwest <- read.csv("http://goo.gl/G1K41K")  # backup data source

# Scatterplot
gg <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  geom_smooth(method = "loess", se = FALSE) +  # FALSE, not the reassignable F
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot",
    caption = "Source: midwest"
  )

plot(gg)
帶圈的散點圖
展示結果時,在圖中圈出特定的點有助於吸引注意力,使用 ggalt 包中的 geom_encircle() 可以方便實現
在 geom_encircle()
中把數據 data
設為只包括興趣點的數據框 並且 你可以 expand
曲線使其剛好繞過點的外圍. 曲線的顏色 粗細也能被修改
# Scatterplot with a red outline drawn around a selected subset of counties.
# install 'ggalt' pkg
# devtools::install_github("hrbrmstr/ggalt")
options(scipen = 999)
library(ggplot2)
library(ggalt)

# Points of interest: the rows that geom_encircle() will enclose
midwest_select <- midwest[midwest$poptotal > 350000 &
                            midwest$poptotal <= 500000 &
                            midwest$area > 0.01 &
                            midwest$area < 0.1, ]

# Plot
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +  # draw points
  geom_smooth(method = "loess", se = FALSE) +        # draw smoothing line
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  geom_encircle(
    aes(x = area, y = poptotal),
    data = midwest_select,  # only the selected rows are encircled
    color = "red",
    size = 2,
    expand = 0.08
  ) +
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot + Encircle",
    caption = "Source: midwest"
  )
抖動圖
# City vs highway mileage scatterplot; many points overlap (see the notes
# that follow in this document).
# Load package and data
library(ggplot2)
data(mpg, package = "ggplot2")  # alternate source: "http://goo.gl/uEeRGu"
theme_set(theme_bw())  # pre-set the bw theme

g <- ggplot(mpg, aes(cty, hwy))

# Scatterplot
g + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    title = "Scatterplot with overlapping points",
    caption = "Source: mpg"  # was "midwest", but the data plotted is mpg
  )
上圖中其實有很多點是重合的 原始數據是整數
# Print the dimensions (rows, columns) of mpg.
dim(mpg)
用 geom_jitter()
畫抖動圖 重合的點在原先的位置基於一定閾值范圍(width)隨機抖動
# Jittered scatterplot of city vs highway mileage.
library(ggplot2)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme
g <- ggplot(mpg, aes(x = cty, y = hwy))

g +
  geom_jitter(width = .5, size = 1) +
  labs(
    title = "Jittered Points",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  )
Counts Chart
處理數據點重合也可以用 counts chart. 重合點越多 圈就越大
# Counts chart of city vs highway mileage, drawn with geom_count().
# Load package and data
library(ggplot2)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme
g <- ggplot(mpg, aes(cty, hwy))

g + geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    title = "Counts Plot"
  )
氣泡圖
比較兩個變量之間的關系時,如果數據還包括以下:
- A Categorical variable (by changing the color) and 一個分類變量
- Another continuous variable (by changing the size of points). 一個連續變量
如果你有四維數據 使用氣泡圖更好 其中兩個變量是數值 numeric (X and Y) 一個是類型 categorical (color) 另一個是數值型 numeric variable (size).
提供不同群組間更好的可視化比較效果
# Bubble chart: displacement vs city mileage for four manufacturers,
# colour = manufacturer, point size = highway mileage.
# Load package and data
library(ggplot2)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

mpg_select <- mpg[mpg$manufacturer %in% c("audi", "ford", "honda", "hyundai"), ]

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme
g <- ggplot(mpg_select, aes(displ, cty)) +
  labs(subtitle = "mpg: Displacement vs City Mileage", title = "Bubble chart")

g + geom_jitter(aes(col = manufacturer, size = hwy)) +
  geom_smooth(aes(col = manufacturer), method = "lm", se = FALSE)
Animated Bubble chart 動畫氣泡圖
使用 gganimate
包實現. 其余和氣泡圖一樣 但是需要在時間維度上展示
關鍵是設置 aes(frame)
到你想展示的列變量
使用 gganimate()設置時間間隔
interval
.動畫化
# Animated bubble chart of the gapminder data, one frame per year.
#
# The original snippet used the first gganimate API (aes(frame = ...) plus
# gganimate(g, interval = ...)); the snippet itself records that this raises
# "It appears that you are trying to use the old API".
# The frame variable is now declared with a transition_*() layer and the
# plot is rendered with animate().
# Source: https://github.com/thomasp85/gganimate
# install.packages("gganimate")
library(ggplot2)
library(gganimate)
library(gapminder)
theme_set(theme_bw())  # pre-set the bw theme

g <- ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop)) +
  geom_point() +
  geom_smooth(aes(group = year), method = "lm", show.legend = FALSE) +
  facet_wrap(~continent, scales = "free") +
  scale_x_log10() +        # convert to log scale
  transition_manual(year)  # one discrete frame per year, no tweening

# 5 frames per second corresponds to the former interval = 0.2 seconds.
# NOTE(review): rendering needs a gif renderer (e.g. gifski) -- confirm installed.
animate(g, fps = 5)
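注意:使用舊式寫法(aes(frame=...) 加 gganimate() 函數)時,新版 gganimate 會報錯 “Error: It appears that you are trying to use the old API, which has been deprecated.” 這個問題與數據集無關,原因是舊 API 已被移除;需改用 transition_*() 系列函數聲明幀變量,並用 animate() 渲染。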
Marginal Histogram / Boxplot 邊緣直方圖 箱形圖
可以在一張圖中展示關系和分布 在散點圖周圍有 X and Y 變量的直方圖
使用 ggMarginal()
函數 來自‘ggExtra
’ 包 除去 histogram 也可以選則
boxplot
or density
圖 通過設置 type 選項
# Scatterplot of cty vs hwy with marginal histogram / boxplot via ggExtra.
# Load package and data
library(ggplot2)
library(ggExtra)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme

# NOTE(review): mpg_select is not referenced by the plot below; kept in case
# later code relies on it.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]

g <- ggplot(mpg, aes(cty, hwy)) +
  geom_count() +
  geom_smooth(method = "lm", se = FALSE)

ggMarginal(g, type = "histogram", fill = "transparent")
ggMarginal(g, type = "boxplot", fill = "transparent")
# ggMarginal(g, type = "density", fill = "transparent")
Correlogram
同一數據框中多個連續變量之間的相關關系 使用 ggcorrplot
包
# Correlogram of the mtcars columns, drawn with ggcorrplot.
# devtools::install_github("kassambara/ggcorrplot")
library(ggplot2)
library(ggcorrplot)

# Correlation matrix, rounded to one decimal place
data(mtcars)
corr <- round(cor(mtcars), digits = 1)

# Plot
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),  # colour palette
  title = "Correlogram of mtcars",
  ggtheme = theme_bw
)
2. Deviation 偏差圖
分歧條形圖 Diverging Bars
geom_bar()
可以用來畫直方圖或者條形圖
geom_bar()
默認設置 stat
來計數 這意味着當你只提供一個連續變量時,畫出的圖是直方圖
為了做條形圖
- Set
stat=identity
- Provide both
x
and y
inside aes()
where, x
is either character
or factor
and y
is numeric.
分歧條形圖要求分類變量只有兩個類別 在連續變量的特定閾值改變類型
# Diverging bar chart of standardised mpg per car.
library(ggplot2)
theme_set(theme_bw())

# Data prep
data("mtcars")  # load data
mtcars[["car name"]] <- rownames(mtcars)  # new column holding the car names
# z-score of mpg, rounded to two decimals
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg)) / sd(mtcars$mpg), 2)
# flag each car as above / below average
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above")
mtcars <- mtcars[order(mtcars$mpg_z), ]  # sort by z-score
# factor levels follow the sorted row order so the plot keeps that order
mtcars[["car name"]] <- factor(mtcars[["car name"]], levels = mtcars[["car name"]])

# Diverging bar chart
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_bar(aes(fill = mpg_type), stat = "identity", width = .5) +
  scale_fill_manual(
    name = "Mileage",
    labels = c("Above Average", "Below Average"),
    values = c("above" = "#00ba38", "below" = "#f8766d")
  ) +
  labs(
    title = "Diverging Bars",
    subtitle = "Normalised mileage from 'mtcars'"
  ) +
  coord_flip()
分歧棒棒糖圖 Diverging Lollipop Chart
使用 geom_point
and geom_segment
# Diverging lollipop chart; expects mtcars to already carry the `car name`
# and mpg_z columns.
library(ggplot2)
theme_set(theme_bw())

ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_point(stat = "identity", fill = "black", size = 6) +
  geom_segment(
    aes(x = `car name`, xend = `car name`, y = 0, yend = mpg_z),
    color = "black"
  ) +
  geom_text(color = "white", size = 2) +
  labs(
    title = "Diverging Lollipop Chart",
    subtitle = "Normalized mileage from 'mtcars': Lollipop"
  ) +
  ylim(-2.5, 2.5) +
  coord_flip()
分歧點圖 Diverging Dot Plot
# Diverging dot plot; expects mtcars to already carry the `car name`,
# mpg_z and mpg_type columns.
library(ggplot2)
theme_set(theme_bw())

# Plot
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_point(aes(col = mpg_type), stat = "identity", size = 6) +
  scale_color_manual(
    name = "Mileage",
    labels = c("Above Average", "Below Average"),
    values = c("above" = "#00ba38", "below" = "#f8766d")
  ) +
  geom_text(color = "white", size = 2) +
  labs(
    title = "Diverging Dot Plot",
    subtitle = "Normalized mileage from 'mtcars': Dotplot"
  ) +
  ylim(-2.5, 2.5) +
  coord_flip()
區域圖 Area Chart
%returns or %change 數據 geom_area()
# Area chart of period-over-period change in the personal savings rate.
library(ggplot2)
# NOTE(review): no quantmod function appears to be called below -- confirm
# before dropping this line.
library(quantmod)
data("economics", package = "ggplot2")

# Compute returns: change in psavert relative to the previous value
# (the first row has no predecessor and is set to 0)
economics$returns_perc <- c(
  0,
  diff(economics$psavert) / economics$psavert[-length(economics$psavert)]
)

# Break points and labels for the axis ticks: every 12th date
brks <- economics$date[seq(1, length(economics$date), 12)]
lbls <- lubridate::year(economics$date[seq(1, length(economics$date), 12)])

# Plot the first 100 rows
ggplot(economics[1:100, ], aes(x = date, y = returns_perc)) +
  geom_area() +
  scale_x_date(breaks = brks, labels = lbls) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Area Chart",
    subtitle = "Perc Returns for Personal Savings",
    y = "% Returns for Personal savings",
    caption = "Source: economics"
  )
3 排序圖
Ordered Bar Chart 排序條形圖
通過y軸變量排序 X 軸變量需要是類型變量或者被轉換為 a factor.
Let’s plot the mean city mileage for each manufacturer from mpg
dataset.
# Prepare data: mean city mileage per manufacturer.
cty_mpg <- aggregate(mpg$cty, by = list(mpg$manufacturer), FUN = mean)  # aggregate
names(cty_mpg) <- c("make", "mileage")  # change column names
cty_mpg <- cty_mpg[order(cty_mpg$mileage), ]  # sort by mileage
# factor levels follow the sorted order so the plot retains it
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make)
head(cty_mpg, 4)
#>          make  mileage
#> 9     lincoln 11.33333
#> 8  land rover 11.50000
#> 3       dodge 13.13514
#> 10    mercury 13.25000
x軸已經是一個factor變量
# Ordered bar chart of mean city mileage per make (uses cty_mpg).
library(ggplot2)
theme_set(theme_bw())

# Draw plot
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Lollipop Chart
重點在值上 好看現代
# Lollipop chart of mean city mileage per make (uses cty_mpg).
library(ggplot2)
theme_set(theme_bw())

# Plot
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_point(size = 3) +
  geom_segment(aes(x = make, xend = make, y = 0, yend = mileage)) +
  labs(
    title = "Lollipop Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Dot Plot
It emphasizes more on the rank ordering of items with respect to actual values and how far apart are the entities with respect to each other.
# Dot plot of mean city mileage per make (uses cty_mpg).
library(ggplot2)
library(scales)
theme_set(theme_classic())

# Plot
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_point(col = "tomato2", size = 3) +  # draw points
  geom_segment(
    aes(x = make, xend = make, y = min(mileage), yend = max(mileage)),
    linetype = "dashed",
    size = 0.1
  ) +  # draw dashed guide lines
  labs(
    title = "Dot Plot",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  coord_flip()
Slope Chart 坡度圖
Slope charts are an excellent way of comparing the positional placements between 2 points on time.比較時間上兩點的位置
目前沒有內置函數來構建 以下代碼是一個啟示:
# Slope chart comparing mean GDP per capita per continent in 1952 and 1957.
library(ggplot2)
library(scales)
theme_set(theme_classic())

# Prep data
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
colnames(df) <- c("continent", "1952", "1957")
left_label <- paste(df$continent, round(df$`1952`), sep = ", ")
right_label <- paste(df$continent, round(df$`1957`), sep = ", ")
# "red" marks a decrease from 1952 to 1957, "green" everything else
df$class <- ifelse((df$`1957` - df$`1952`) < 0, "red", "green")

# Plot
p <- ggplot(df) +
  geom_segment(
    aes(x = 1, xend = 2, y = `1952`, yend = `1957`, col = class),
    size = .75,
    show.legend = FALSE
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", size = .1) +
  geom_vline(xintercept = 2, linetype = "dashed", size = .1) +
  scale_color_manual(
    labels = c("Up", "Down"),
    values = c("green" = "#00ba38", "red" = "#f8766d")
  ) +  # color of lines
  labs(x = "", y = "Mean GdpPerCap") +  # axis labels
  xlim(.5, 2.5) +
  ylim(0, (1.1 * (max(df$`1952`, df$`1957`))))  # x and y axis limits

# Add texts. annotate() is used instead of geom_text(): the labels are
# supplied as plain vectors, and a constant label passed to geom_text()
# would be drawn once per row of df, stacking copies on top of each other.
p <- p + annotate("text", label = left_label, y = df$`1952`, x = rep(1, NROW(df)), hjust = 1.1, size = 3.5)
p <- p + annotate("text", label = right_label, y = df$`1957`, x = rep(2, NROW(df)), hjust = -0.1, size = 3.5)
p <- p + annotate("text", label = "Time 1", x = 1, y = 1.1 * (max(df$`1952`, df$`1957`)), hjust = 1.2, size = 5)  # title
p <- p + annotate("text", label = "Time 2", x = 2, y = 1.1 * (max(df$`1952`, df$`1957`)), hjust = -0.1, size = 5)  # title

# Minify theme
p + theme(
  panel.background = element_blank(),
  panel.grid = element_blank(),
  axis.ticks = element_blank(),
  axis.text.x = element_blank(),
  panel.border = element_blank(),
  plot.margin = unit(c(1, 2, 1, 2), "cm")
)
Dumbbell Plot 啞鈴圖
1. Visualize relative positions (like growth and decline) between two points in time.
2. Compare distance between two categories.比較兩類間距離
Y 變量應該是 a factor and the levels of the factor variable should be in the same order as it should appear in the plot.
# Dumbbell chart of pct_2013 vs pct_2014 per Area.
# devtools::install_github("hrbrmstr/ggalt")
library(ggplot2)
library(ggalt)
theme_set(theme_classic())

health <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/health.csv")
# Factor levels follow the row order of the file, for the right ordering
# of the dumbbells.
health$Area <- factor(health$Area, levels = as.character(health$Area))
# health$Area <- factor(health$Area)

gg <- ggplot(health, aes(x = pct_2013, xend = pct_2014, y = Area, group = Area)) +
  geom_dumbbell(color = "#a3c4dc", size = 0.75, point.colour.l = "#0e668b") +
  # scales:: is explicit because this snippet does not attach scales itself;
  # the argument is spelled `labels` rather than relying on partial matching.
  scale_x_continuous(labels = scales::percent) +
  labs(
    x = NULL,
    y = NULL,
    title = "Dumbbell Chart",
    subtitle = "Pct Change: 2013 vs 2014",
    caption = "Source: https://github.com/hrbrmstr/ggalt"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.background = element_rect(fill = "#f7f7f7"),
    panel.background = element_rect(fill = "#f7f7f7"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_line(),
    axis.ticks = element_blank(),
    legend.position = "top",
    panel.border = element_blank()
  )

plot(gg)
4. 分布圖
有很多數據點 想研究怎么分布
直方圖
默認情況下,只提供一個變量時 geom_bar() 會計算每個取值的數量;若要讓它按給定的數值畫條形圖,則
stat=identity
選項一定要設置,而且 x 和 y 軸的變量都要提供
Histogram on a continuous variable 基於連續變量的直方圖
使用 geom_bar()
或者 geom_histogram()生成
使用geom_histogram()
時 使用 bins
選項控制條形的個數.設置 binwidth控制bin的范圍寬度
geom_histogram
同時控制bins
和 binwidth 更常用
# Histograms of engine displacement, filled by vehicle class.
library(ggplot2)
theme_set(theme_classic())

# Histogram on a continuous (numeric) variable
g <- ggplot(mpg, aes(x = displ)) +
  scale_fill_brewer(palette = "Spectral")

# Variant 1: bin width is set explicitly
g +
  geom_histogram(aes(fill = class), binwidth = .1, col = "black", size = .1) +
  labs(
    title = "Histogram with Auto Binning",
    subtitle = "Engine Displacement across Vehicle Classes"
  )

# Variant 2: number of bins is set explicitly
g +
  geom_histogram(aes(fill = class), bins = 5, col = "black", size = .1) +
  labs(
    title = "Histogram with Fixed Bins",
    subtitle = "Engine Displacement across Vehicle Classes"
  )
Histogram on a categorical variable 基於分類變量的直方圖
生成不同種類的頻率圖 通過調整 width
, 你可以調整條形的厚度
# Bar counts per manufacturer, stacked by vehicle class.
library(ggplot2)
theme_set(theme_classic())

# Histogram on a categorical variable
g <- ggplot(mpg, aes(x = manufacturer))

g +
  geom_bar(aes(fill = class), width = 0.5) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Histogram on Categorical Variable",
    subtitle = "Manufacturer across Vehicle Classes"
  )
Density plot 密度圖
# Density of city mileage, one filled curve per cylinder count.
library(ggplot2)
theme_set(theme_classic())

# Plot
g <- ggplot(mpg, aes(x = cty))

g +
  geom_density(aes(fill = factor(cyl)), alpha = 0.8) +
  labs(
    title = "Density plot",
    subtitle = "City Mileage Grouped by Number of cylinders",
    caption = "Source: mpg",
    x = "City Mileage",
    fill = "# Cylinders"
  )
Box Plot 箱形圖
中位數 范圍 異常值
箱子頂部是 75% 底部是 25% 線的端點是在 1.5*IQR處 IQR 或者 Inter Quartile Range 是 25th 和 75th 百分位置的距離 線端點之外的點被認為是極端異常點
設置 varwidth=T
可以自動調整箱子的寬度到合適比例
# Box plot of city mileage per vehicle class, with varwidth enabled.
library(ggplot2)
theme_set(theme_classic())

# Plot
g <- ggplot(mpg, aes(class, cty))

g + geom_boxplot(varwidth = TRUE, fill = "plum") +  # TRUE, not the reassignable T
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  )
# Box plot of city mileage per vehicle class, filled by cylinder count.
library(ggthemes)

g <- ggplot(mpg, aes(x = class, y = cty))

g +
  geom_boxplot(aes(fill = factor(cyl))) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  )
Dot + Box Plot 點+箱形圖
# Box plot of city mileage per manufacturer with a dot plot overlaid.
library(ggplot2)
theme_set(theme_bw())

# Plot
g <- ggplot(mpg, aes(manufacturer, cty))

g + geom_boxplot() +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = .5, fill = "red") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  # The x aesthetic is manufacturer, so the labels say so (they said "Class").
  labs(
    title = "Box plot + Dot plot",
    subtitle = "City Mileage vs Manufacturer: Each dot represents 1 row in source data",
    caption = "Source: mpg",
    x = "Manufacturer",
    y = "City Mileage"
  )
Tufte Boxplot
ggthemes
包提供 受啟發於 Edward Tufte.
# Tufte-style box plot of city mileage per manufacturer.
library(ggthemes)
library(ggplot2)
theme_set(theme_tufte())  # from ggthemes

# Plot
g <- ggplot(mpg, aes(manufacturer, cty))

g + geom_tufteboxplot() +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  # The x aesthetic is manufacturer, so the labels say so (they said "Class").
  labs(
    title = "Tufte Styled Boxplot",
    subtitle = "City Mileage grouped by Manufacturer",
    caption = "Source: mpg",
    x = "Manufacturer",
    y = "City Mileage"
  )
Violin Plot 小提琴圖
使用 geom_violin()
.
# Violin plot of city mileage per vehicle class.
library(ggplot2)
theme_set(theme_bw())

# Plot
g <- ggplot(mpg, aes(x = class, y = cty))

g +
  geom_violin() +
  labs(
    title = "Violin plot",
    subtitle = "City Mileage vs Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  )
Population Pyramid 人口金字塔圖
男性女性用戶在不同階段類別的分布
# Population-pyramid style funnel: Users per Stage, split by Gender.
library(ggplot2)
library(ggthemes)
options(scipen = 999)  # turn off scientific notation like 1e+40

# Read data
email_campaign_funnel <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")

# X axis breaks and labels. The labels are derived from the breaks so the
# two cannot drift apart: "15m" "10m" "5m" "0m" "5m" "10m" "15m".
brks <- seq(-15000000, 15000000, 5000000)
lbls <- paste0(as.character(abs(brks) / 1000000), "m")

# Plot
ggplot(email_campaign_funnel, aes(x = Stage, y = Users, fill = Gender)) +  # fill column
  geom_bar(stat = "identity", width = .6) +             # draw the bars
  scale_y_continuous(breaks = brks, labels = lbls) +    # breaks and labels
  coord_flip() +                                        # flip axes
  labs(title = "Email Campaign Funnel") +
  theme_tufte() +                                       # Tufte theme from ggthemes
  theme(
    plot.title = element_text(hjust = .5),              # centre plot title
    axis.ticks = element_blank()
  ) +
  scale_fill_brewer(palette = "Dark2")                  # color palette
5. Composition 成分圖
Waffle Chart 華夫圖
展示總人口的種類構成 雖然沒有直接的函數 但是可以通過巧妙地使用 geom_tile()函數實現
# Waffle chart: an nrows x nrows grid of tiles coloured by category share.
var <- mpg$class  # the categorical data

## Prep data (nothing to change here)
nrows <- 10
df <- expand.grid(y = 1:nrows, x = 1:nrows)

# Tiles per category. Plain round() can give a total other than nrows^2,
# in which case the df$category assignment below fails. Take the floor of
# each share and hand the leftover tiles to the categories with the
# largest fractional parts, so the total is always exactly nrows^2.
raw_share <- c(table(var)) * ((nrows * nrows) / length(var))
categ_table <- floor(raw_share)
n_missing <- nrows * nrows - sum(categ_table)
if (n_missing > 0) {
  top_up <- order(raw_share - categ_table, decreasing = TRUE)[seq_len(n_missing)]
  categ_table[top_up] <- categ_table[top_up] + 1
}
categ_table
#>    2seater    compact    midsize    minivan     pickup subcompact        suv
#>          2         20         18          5         14         15         26

df$category <- factor(rep(names(categ_table), categ_table))

## Plot
ggplot(df, aes(x = x, y = y, fill = category)) +
  geom_tile(color = "black", size = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), trans = "reverse") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Waffle Chart",
    subtitle = "'Class' of vehicles",
    caption = "Source: mpg"
  ) +
  theme(
    # fill = NA keeps the border from being drawn as a filled rectangle
    # over the tiles under themes whose panel.border is blank.
    panel.border = element_rect(size = 2, fill = NA),
    plot.title = element_text(size = rel(1.2)),
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    legend.title = element_blank(),
    legend.position = "right"
  )
Pie Chart 餅圖
使用 coord_polar()
.
# Pie charts of vehicle class, built two ways.
library(ggplot2)
theme_set(theme_classic())

# Source: frequency table
df <- as.data.frame(table(mpg$class))
colnames(df) <- c("class", "freq")

pie <- ggplot(df, aes(x = "", y = freq, fill = factor(class))) +
  geom_bar(width = 1, stat = "identity") +
  theme(
    axis.line = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    fill = "class",
    x = NULL,
    y = NULL,
    title = "Pie Chart of class",
    caption = "Source: mpg"
  )

pie + coord_polar(theta = "y", start = 0)

# Source: categorical variable.
# mpg$class
pie <- ggplot(mpg, aes(x = "", fill = factor(class))) +
  geom_bar(width = 1) +
  theme(
    axis.line = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    fill = "class",
    x = NULL,
    y = NULL,
    title = "Pie Chart of class",
    caption = "Source: mpg"
  )

pie + coord_polar(theta = "y", start = 0)
Treemap 樹形圖
通過內嵌矩形展示分級數據 treemapify
包提供需要的函數把數據轉換為想要的格式 (treemapify
) 並且畫出圖形 (ggplotify
).
必須使用 treemapify()轉換數據格式 數據要求:
one variable each that describes the area
of the tiles, variable for fill
color, variable that has the tile’s label
and finally the parent group
.
一旦格式轉換好 只需要調用 ggplotify()
這個例子有點問題:新版 treemapify 已移除 ggplotify(),需改用 geom_treemap() 等 ggplot2 圖層來畫圖
# Treemap of programming languages: tile area = value, colour and
# grouping = parent, tile label = id.
#
# Current treemapify releases no longer provide ggplotify(); the treemap
# is drawn with ggplot2 layers (geom_treemap() and friends) instead of the
# former treemapify() + ggplotify() pair.
library(ggplot2)
library(treemapify)

proglangs <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/proglanguages.csv")

# Plot
treeMapPlot <- ggplot(
  proglangs,
  aes(area = value, fill = parent, label = id, subgroup = parent)
) +
  geom_treemap() +
  geom_treemap_subgroup_border() +  # outline each parent group
  geom_treemap_text(colour = "white", place = "centre") +
  scale_fill_brewer(palette = "Dark2")

print(treeMapPlot)
Bar Chart 條形圖
- 設置
stat=identity
- 提供
x
和y
在aes()
中,x
是character
或者factor
,y
是數值變量
# Prep frequency table: number of rows per manufacturer, as a data frame.
freqtable <- table(mpg$manufacturer)
df <- as.data.frame.table(freqtable)
head(df)
#>        Var1 Freq
#> 1      audi   18
#> 2 chevrolet   19
#> 3     dodge   37
#> 4      ford   25
#> 5     honda    9
#> 6   hyundai   14
# Draw the bar chart from the frequency table df (columns Var1, Freq).
library(ggplot2)
theme_set(theme_classic())

# Plot
g <- ggplot(df, aes(x = Var1, y = Freq))

g +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato2") +
  labs(
    title = "Bar Chart",
    subtitle = "Manufacturer of vehicles",
    caption = "Source: Frequency of Manufacturers from 'mpg' dataset"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
也可以直接從分類列變量畫條形圖:此時只提供 X,並且 stat=identity
不需要設置,geom_bar() 會自動計數
# Bar chart built directly from a categorical column (no stat set).
g <- ggplot(mpg, aes(x = manufacturer))

g +
  geom_bar(aes(fill = class), width = 0.5) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Categorywise Bar Chart",
    subtitle = "Manufacturer of vehicles",
    caption = "Source: Manufacturers from 'mpg' dataset"
  )
6. 變化
Time Series Plot From a Time Series Object (ts
) 使用時間序列對象來畫時間序列圖
ggfortify
可以直接畫 (ts
).
## AirPassengers is a time series (ts) object
library(ggplot2)
library(ggfortify)
theme_set(theme_classic())

# Plot
autoplot(AirPassengers) +
  labs(title = "AirPassengers") +
  theme(plot.title = element_text(hjust = 0.5))
Time Series Plot From a Data Frame 從數據框畫時間序列圖
使用geom_line()
, data.frame
Default X Axis Labels
# Line chart of economics$returns_perc over date; expects the returns_perc
# column to already exist on economics.
library(ggplot2)
theme_set(theme_classic())

# Allow default x axis labels
ggplot(economics, aes(x = date)) +
  geom_line(aes(y = returns_perc)) +
  labs(
    title = "Time Series Chart",
    subtitle = "Returns Percentage from 'Economics' Dataset",
    caption = "Source: Economics",
    y = "Returns %"
  )
Time Series Plot For a Monthly Time Series 畫月為間隔的時間序列圖形
使用 scale_x_date()設置自己想要的時間間隔
# Monthly time series of returns_perc for the first 24 rows of economics.
library(ggplot2)
library(lubridate)
theme_set(theme_bw())

economics_m <- economics[1:24, ]

# Labels and breaks for the x axis: one tick per row, "Mon YYYY"
lbls <- paste0(
  month.abb[month(economics_m$date)],
  " ",
  lubridate::year(economics_m$date)
)
brks <- economics_m$date

# Plot
ggplot(economics_m, aes(x = date)) +
  geom_line(aes(y = returns_perc)) +
  labs(
    title = "Monthly Time Series",
    subtitle = "Returns Percentage from Economics Dataset",
    caption = "Source: Economics",
    y = "Returns %"
  ) +  # title and caption
  scale_x_date(labels = lbls, breaks = brks) +  # monthly ticks and labels
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    panel.grid.minor = element_blank()  # turn off minor grid lines
  )
Calendar Heatmap 日歷熱力圖
使用 geom_tile 需要更多的數據准備工作
# Calendar heatmap of VIX.Close by week-of-month and weekday.
# http://margintale.blogspot.in/2012/04/ggplot2-time-series-heatmaps.html
library(ggplot2)
library(plyr)
library(scales)
library(zoo)

df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv")
df$date <- as.Date(df$date)  # format date
df <- df[df$year >= 2012, ]  # filter required years

# Create month-week: week number within each year-month
df$yearmonth <- as.yearmon(df$date)
df$yearmonthf <- factor(df$yearmonth)
df <- ddply(df, .(yearmonthf), transform, monthweek = 1 + week - min(week))
df <- df[, c("year", "yearmonthf", "monthf", "week", "monthweek", "weekdayf", "VIX.Close")]
head(df)
#>   year yearmonthf monthf week monthweek weekdayf VIX.Close
#> 1 2012   Jan 2012    Jan    1         1      Tue     22.97
#> 2 2012   Jan 2012    Jan    1         1      Wed     22.22
#> 3 2012   Jan 2012    Jan    1         1      Thu     21.48
#> 4 2012   Jan 2012    Jan    1         1      Fri     20.63
#> 5 2012   Jan 2012    Jan    2         2      Mon     21.07
#> 6 2012   Jan 2012    Jan    2         2      Tue     20.69

# Plot
ggplot(df, aes(x = monthweek, y = weekdayf, fill = VIX.Close)) +
  geom_tile(colour = "white") +
  facet_grid(year ~ monthf) +
  scale_fill_gradient(low = "red", high = "green") +
  labs(
    x = "Week of Month",
    y = "",
    title = "Time-Series Calendar Heatmap",
    subtitle = "Yahoo Closing Price",
    fill = "Close"
  )
Slope Chart 坡度圖
適合僅有幾個時間點時
# Tufte-style slopegraph of cancer survival rates.
library(ggplot2)  # ggplot(), theme_set() etc. are used below
library(dplyr)
theme_set(theme_classic())

source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")

# Define functions. Source: https://github.com/jkeirstead/r-slopegraph

# Compute vertical positions for a slopegraph so that neighbouring rows
# stay at least a minimum distance apart.
#
# df        data frame holding the three columns named by x, y and group
# x, y, group  column names (strings) for the x value, the y value and the
#           series identifier
# method    accepted for compatibility; not used by the body
# min.space minimum gap between adjacent rows, as a fraction of the range
#           of the values
#
# Returns a long data frame with columns group, yshift, x, y and ypos,
# where ypos = y + yshift.
tufte_sort <- function(df, x = "year", y = "value", group = "group",
                       method = "tufte", min.space = 0.05) {
  ## First rename the columns for consistency
  ids <- match(c(x, y, group), names(df))
  df <- df[, ids]
  names(df) <- c("x", "y", "group")

  ## Expand grid to ensure every combination has a defined value
  tmp <- expand.grid(x = unique(df$x), group = unique(df$group))
  tmp <- merge(df, tmp, all.y = TRUE)
  df <- mutate(tmp, y = ifelse(is.na(y), 0, y))

  ## Cast into a matrix shape and arrange by first column.
  ## reshape2:: is used instead of require() inside the function, so a
  ## missing package fails loudly.
  tmp <- reshape2::dcast(df, group ~ x, value.var = "y")
  ord <- order(tmp[, 2])
  tmp <- tmp[ord, ]

  min.space <- min.space * diff(range(tmp[, -1]))
  yshift <- numeric(nrow(tmp))
  ## Start at "bottom" row and repeat for the rest of the rows.
  ## seq_len(...)[-1] is empty for a single-row table, where 2:nrow(tmp)
  ## would have counted 2, 1.
  for (i in seq_len(nrow(tmp))[-1]) {
    ## Shift subsequent row up by equal space so gap between
    ## two entries is >= minimum
    mat <- as.matrix(tmp[(i - 1):i, -1])
    d.min <- min(diff(mat))
    yshift[i] <- max(min.space - d.min, 0)
  }

  tmp <- cbind(tmp, yshift = cumsum(yshift))

  scale <- 1
  tmp <- reshape2::melt(tmp, id.vars = c("group", "yshift"),
                        variable.name = "x", value.name = "y")
  ## Store these gaps in a separate variable so that they can be scaled:
  ## ypos = scale * yshift + y
  tmp$ypos <- tmp$y + scale * tmp$yshift
  tmp
}

# Build the slopegraph from the data frame returned by tufte_sort():
# one grey line per group, the y value printed at each point, and the
# group names as y axis labels at the positions of the first x value.
plot_slopegraph <- function(df) {
  first_x <- subset(df, x == head(x, 1))
  ylabs <- first_x$group
  yvals <- first_x$ypos
  fontSize <- 3
  ggplot(df, aes(x = x, y = ypos)) +
    geom_line(aes(group = group), colour = "grey80") +
    geom_point(colour = "white", size = 8) +
    geom_text(aes(label = y), size = fontSize, family = "American Typewriter") +
    scale_y_continuous(name = "", breaks = yvals, labels = ylabs)
}

## Prepare data
df <- tufte_sort(source_df, x = "year", y = "value", group = "group",
                 method = "tufte", min.space = 0.05)
df <- transform(
  df,
  x = factor(x, levels = c(5, 10, 15, 20),
             labels = c("5 years", "10 years", "15 years", "20 years")),
  y = round(y)
)

## Plot
plot_slopegraph(df) +
  labs(title = "Estimates of % survival rates") +
  theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(hjust = 0.5, family = "American Typewriter", face = "bold"),
    axis.text = element_text(family = "American Typewriter", face = "bold")
  )
Seasonal Plot 季節變化圖
使用 forecast::ggseasonplot
# Seasonal plots for AirPassengers and a 1920-1925 window of nottem.
library(ggplot2)
library(forecast)
theme_set(theme_classic())

# Subset data to a smaller time window
nottem_small <- window(nottem, start = c(1920, 1), end = c(1925, 12))

# Plot
ggseasonplot(AirPassengers) +
  labs(title = "Seasonal plot: International Airline Passengers")

ggseasonplot(nottem_small) +
  labs(title = "Seasonal plot: Air temperatures at Nottingham Castle")
7. Groups
Hierarchical Dendrogram 等級樹狀圖
# Dendrogram of a hierarchical clustering of USArrests.
# install.packages("ggdendro")
library(ggplot2)
library(ggdendro)
theme_set(theme_bw())

# Hierarchical clustering on the distance matrix ("ave" method)
hc <- hclust(dist(USArrests), "ave")

# Plot
ggdendrogram(hc, rotate = TRUE, size = 2)
Clusters 聚類圖
使用 geom_encircle()圈出類 輸入變量為一個子數據框
# Iris species plotted on the first two principal components, each species
# outlined with geom_encircle().
# devtools::install_github("hrbrmstr/ggalt")
library(ggplot2)
library(ggalt)
library(ggfortify)
theme_set(theme_classic())

# Compute data with principal components ------------------
df <- iris[c(1, 2, 3, 4)]
pca_mod <- prcomp(df)  # compute principal components

# Data frame of principal components ----------------------
df_pc <- data.frame(pca_mod$x, Species = iris$Species)
df_pc_vir <- df_pc[df_pc$Species == "virginica", ]   # rows for 'virginica'
df_pc_set <- df_pc[df_pc$Species == "setosa", ]      # rows for 'setosa'
df_pc_ver <- df_pc[df_pc$Species == "versicolor", ]  # rows for 'versicolor'

# Plot ----------------------------------------------------
ggplot(df_pc, aes(x = PC1, y = PC2, col = Species)) +
  geom_point(aes(shape = Species), size = 2) +  # draw points
  labs(
    title = "Iris Clustering",
    subtitle = "With principal components PC1 and PC2 as X and Y axis",
    caption = "Source: Iris"
  ) +
  coord_cartesian(
    xlim = 1.2 * c(min(df_pc$PC1), max(df_pc$PC1)),
    ylim = 1.2 * c(min(df_pc$PC2), max(df_pc$PC2))
  ) +  # change axis limits
  geom_encircle(data = df_pc_vir, aes(x = PC1, y = PC2)) +  # draw circles
  geom_encircle(data = df_pc_set, aes(x = PC1, y = PC2)) +
  geom_encircle(data = df_pc_ver, aes(x = PC1, y = PC2))
8. Spatial 空間分布圖
ggmap
包使得可以 和google maps api 進行交互 同時獲取想要的地方的坐標位置
使用 geocode()函數獲取地點的坐標 使用
qmap()
獲取地圖
maptype決定地圖類型
設定 zoom
來選擇縮放尺度大小 默認為10
# Maps of Chennai with a handful of places marked and encircled.
# Preferably install the dev versions ----------
# devtools::install_github("dkahle/ggmap")
# devtools::install_github("hrbrmstr/ggalt")

# Load packages
library(ggplot2)
library(ggmap)
library(ggalt)

# NOTE(review): recent ggmap releases appear to require a registered Google
# API key before geocode() / Google map requests work -- confirm.

# Get Chennai's coordinates --------------------------------
chennai <- geocode("Chennai")  # get longitude and latitude

# Get the maps ---------------------------------------------
# Google satellite map
chennai_ggl_sat_map <- qmap("chennai", zoom = 12, source = "google", maptype = "satellite")

# Google road map
chennai_ggl_road_map <- qmap("chennai", zoom = 12, source = "google", maptype = "roadmap")

# Google hybrid map
chennai_ggl_hybrid_map <- qmap("chennai", zoom = 12, source = "google", maptype = "hybrid")

# Open Street Map
chennai_osm_map <- qmap("chennai", zoom = 12, source = "osm")

# Get coordinates for Chennai's places ---------------------
chennai_places <- c("Kolathur", "Washermanpet", "Royapettah", "Adyar", "Guindy")
places_loc <- geocode(chennai_places)  # get longitudes and latitudes

# The same two layers (points plus an outline around them) are added to
# every base map, so they are built once and reused.
places_overlay <- list(
  geom_point(aes(x = lon, y = lat), data = places_loc, alpha = 0.7, size = 7, color = "tomato"),
  geom_encircle(aes(x = lon, y = lat), data = places_loc, size = 2, color = "blue")
)

# Plot Open Street Map -------------------------------------
chennai_osm_map + places_overlay

# Plot Google road map -------------------------------------
chennai_ggl_road_map + places_overlay

# Google hybrid map ----------------------------------------
chennai_ggl_hybrid_map + places_overlay
參考:
http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html