chap1 数据可视化的作用

作者：带土；Daitu; Adam 邮箱：2505131775@qq.com

该章节主要是数据可视化的作用绘制的一些辅助图像

library(ggplot2)

theme_set(theme_bw(base_family = "STKaiti"))

帮助数据清洗

1、可以快速帮助研究者检查数据的缺失值，及数据的分布情况，帮助数据清洗的快速完成。

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## Loading required package: data.table

## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.

## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

data("sleep")
head(sleep)

##    BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
## 1 6654.000   5712.0   NA    NA   3.3 38.6  645    3   5      3
## 2    1.000      6.6  6.3   2.0   8.3  4.5   42    3   1      3
## 3    3.385     44.5   NA    NA  12.5 14.0   60    1   1      1
## 4    0.920      5.7   NA    NA  16.5   NA   25    5   2      3
## 5 2547.000   4603.0  2.1   1.8   3.9 69.0  624    3   5      4
## 6   10.550    179.5  9.1   0.7   9.8 27.0  180    4   4      4

## 可视化数据缺失值
aggr(sleep)

## 可视化缺失数据的分布
par(mfrow = c(2,2))
hist(sleep$NonD,breaks = 12, col = "lightblue")
hist(sleep$Dream,breaks = 12, col = "lightblue")
hist(sleep$Span,breaks = 12, col = "lightblue")
hist(sleep$Gest,breaks = 12, col = "lightblue")

使用线性回归方程，说明数据可视化的作用

2、通过对数据进行可视化，可以发现数据中是否存在异常值，或者对数据选择合适的模型。

library(gridExtra)
lmdata4 <- read.csv("data/lmdata4.csv")
head(lmdata4)

##   x1   y1 x2   y2 x3    y3 x4   y4
## 1 10 8.04 10 9.14 10  7.46  8 6.58
## 2  8 6.95  8 8.14  8  6.77  8 5.76
## 3 13 7.58 13 8.74 13 12.74  8 7.71
## 4  9 8.81  9 8.77  9  7.11  8 8.84
## 5 11 8.33 11 9.26 11  7.81  8 8.47
## 6 14 9.96 14 8.10 14  8.84  8 7.04

## 可视化数据的走势，使用合适的处理方法
p1 <- ggplot(lmdata4,aes(x=x1,y=y1))+
  geom_point()+geom_smooth(method = "lm")
p2 <- ggplot(lmdata4,aes(x=x2,y=y2))+
  geom_point()+geom_smooth(method = "lm")
p3 <- ggplot(lmdata4,aes(x=x3,y=y3))+
  geom_point()+geom_smooth(method = "lm")
p4 <- ggplot(lmdata4,aes(x=x4,y=y4))+
  geom_point()+geom_smooth(method = "lm")

gridExtra::grid.arrange(p1,p2,p3,p4,nrow=2)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

通过可视化散点图可视化，选择合适的建模方法

## 可视化适合Kmeans算法的图像
kdata <- data.frame(rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 2, sd = 0.4), ncol = 2)))
colnames(kdata) <- c("X1", "X2")
p1 <- ggplot(kdata,aes(x = X1,y = X2))+geom_point()

## 可视化双月图像,适合密度聚类
moons <- read.csv("data/moonsdatas.csv")
head(moons)

##           X1          X2 Y
## 1  0.7424201  0.58556710 0
## 2  1.7444393  0.03909624 1
## 3  1.6934791 -0.19061851 1
## 4  0.7395695  0.63927458 0
## 5 -0.3780247  0.97481407 0
## 6  0.8943966  0.26841801 0

p2 <- ggplot(moons,aes(x=X1,y=X2))+geom_point()

grid.arrange(p1,p2,nrow=1)

3、通过数据可视化，研究数据特征之间的关系。如相关系数。

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

data("iris")
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

## 矩阵散点图分析变量之间的相关性
ggscatmat(iris,columns = 1:4,color = "Species",alpha = 0.8)

## 相关系数热力图
library(ggcorrplot)
data(mtcars)
corr <- round(cor(mtcars), 2)
corr

##        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
## mpg   1.00 -0.85 -0.85 -0.78  0.68 -0.87  0.42  0.66  0.60  0.48 -0.55
## cyl  -0.85  1.00  0.90  0.83 -0.70  0.78 -0.59 -0.81 -0.52 -0.49  0.53
## disp -0.85  0.90  1.00  0.79 -0.71  0.89 -0.43 -0.71 -0.59 -0.56  0.39
## hp   -0.78  0.83  0.79  1.00 -0.45  0.66 -0.71 -0.72 -0.24 -0.13  0.75
## drat  0.68 -0.70 -0.71 -0.45  1.00 -0.71  0.09  0.44  0.71  0.70 -0.09
## wt   -0.87  0.78  0.89  0.66 -0.71  1.00 -0.17 -0.55 -0.69 -0.58  0.43
## qsec  0.42 -0.59 -0.43 -0.71  0.09 -0.17  1.00  0.74 -0.23 -0.21 -0.66
## vs    0.66 -0.81 -0.71 -0.72  0.44 -0.55  0.74  1.00  0.17  0.21 -0.57
## am    0.60 -0.52 -0.59 -0.24  0.71 -0.69 -0.23  0.17  1.00  0.79  0.06
## gear  0.48 -0.49 -0.56 -0.13  0.70 -0.58 -0.21  0.21  0.79  1.00  0.27
## carb -0.55  0.53  0.39  0.75 -0.09  0.43 -0.66 -0.57  0.06  0.27  1.00

ggcorrplot(corr,method = "circle",type = "upper",title = "corr heatmap",
           show.diag = TRUE,ggtheme = ggplot2::theme_bw,lab = TRUE,
           lab_size = 2.5)

4 :通过图像分析数据的变化趋势。

## 折线图分析数据的变化情况
data(AirPassengers)
plot(AirPassengers)

library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(stringr)
library(RColorBrewer)
## 数据准备
data(AirPassengers)
AirPas <- as.data.frame(AirPassengers)
AirPas$x <- as.numeric(AirPas$x)
AirPas$yearmonth <- yearmon(time(AirPassengers))
AirPas$year <- format(AirPas$yearmonth,format = "%Y")
AirPas$month <- format(AirPas$yearmonth,"%B")
## 热力图分析数据的变化
AirPas$month <- factor(AirPas$month,levels = c("一月","二月","三月",
                               "四月","五月", "六月","七月",
                               "八月","九月", "十月","十一月",
                               "十二月"))
head(AirPas)

##     x yearmonth year month
## 1 112    1 1949 1949  一月
## 2 118    2 1949 1949  二月
## 3 132    3 1949 1949  三月
## 4 129    4 1949 1949  四月
## 5 121    5 1949 1949  五月
## 6 135    6 1949 1949  六月

ggplot(AirPas,aes(x=year,y=month))+
  geom_tile(aes(fill = x))+
  scale_fill_gradientn(colours=rev(brewer.pal(10,"RdYlGn")))+
  geom_text(aes(label=x),size=2.5)+
  ggtitle("飞机乘客数量变化")

5、通过数据可视化，判断建立的数据分析模型是否是正确的，模型是否还能更好。

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

data("iris")
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

## 数据kmeans聚类为两类
k2 <- kmeans(iris[,1:4],centers = 2)
k3 <- kmeans(iris[,1:4],centers = 3)
k4 <- kmeans(iris[,1:4],centers = 4)
k5 <- kmeans(iris[,1:4],centers = 5)
p2 <- fviz_cluster(k2,iris[,1:4],geom = "point",main = "2 clusters")
p3 <- fviz_cluster(k3,iris[,1:4],geom = "point",main = "3 clusters")
p4 <- fviz_cluster(k4,iris[,1:4],geom = "point",main = "4 clusters")
p5 <- fviz_cluster(k5,iris[,1:4],geom = "point",main = "5 clusters")

grid.arrange(p2,p3,p4,p5,nrow=2)

可视化时间序列模型的拟合情况,选择合适的模型

library(prophet)

## Loading required package: Rcpp

## Loading required package: rlang

## 
## Attaching package: 'rlang'

## The following object is masked from 'package:data.table':
## 
##     :=

library(zoo)
library(timeSeries)

## Loading required package: timeDate

## 
## Attaching package: 'timeSeries'

## The following object is masked from 'package:zoo':
## 
##     time<-

## 数据准备
data(AirPassengers)
AirPas <- as.data.frame(AirPassengers)
colnames(AirPas) <- "y"
AirPas$ds <- as.yearmon(time(AirPassengers))
head(AirPas)

##     y      ds
## 1 112  1 1949
## 2 118  2 1949
## 3 132  3 1949
## 4 129  4 1949
## 5 121  5 1949
## 6 135  6 1949

## 模型1
mode1 <- prophet(AirPas,growth = "linear")

## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## 预测后面两年的数据,并将预测结果可视化
future1 <- make_future_dataframe(mode1, periods = 24,freq = "month")
forecast1 <- predict(mode1, future1)
p1 <- plot(mode1, forecast1)+labs(title = "model 1",x = "")

## 建立不同的模型,模型2
mode2 <- prophet(AirPas,growth = "linear",
                 yearly.seasonality = TRUE,weekly.seasonality = FALSE,
                 daily.seasonality = FALSE,seasonality.mode = "multiplicative")
## 预测后面两年的数据,并将预测结果可视化
future2 <- make_future_dataframe(mode2, periods = 24,freq = "month")
forecast2 <- predict(mode2, future2)
p2 <- plot(mode2, forecast2)+labs(title = "model 2",x = "")


grid.arrange(p1,p2,nrow=2)

6、通过将数据建模的结果可视化，方便对模型的解读和理解。

library(rpart)
library(rpart.plot)
## 可视化决策树模型
data(ptitanic)
head(ptitanic)

##   pclass survived    sex     age sibsp parch
## 1    1st survived female 29.0000     0     0
## 2    1st survived   male  0.9167     1     2
## 3    1st     died female  2.0000     1     2
## 4    1st     died   male 30.0000     1     2
## 5    1st     died female 25.0000     1     2
## 6    1st survived   male 48.0000     0     0

fit <- rpart(survived ~ ., data = ptitanic)
rpart.plot(fit,type = 3,main = "titanic survived")

fit

## n= 1309 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 1309 500 died (0.6180290 0.3819710)  
##     2) sex=male 843 161 died (0.8090154 0.1909846)  
##       4) age>=9.5 796 136 died (0.8291457 0.1708543) *
##       5) age< 9.5 47  22 survived (0.4680851 0.5319149)  
##        10) sibsp>=2.5 20   1 died (0.9500000 0.0500000) *
##        11) sibsp< 2.5 27   3 survived (0.1111111 0.8888889) *
##     3) sex=female 466 127 survived (0.2725322 0.7274678)  
##       6) pclass=3rd 216 106 died (0.5092593 0.4907407)  
##        12) sibsp>=2.5 21   3 died (0.8571429 0.1428571) *
##        13) sibsp< 2.5 195  92 survived (0.4717949 0.5282051)  
##          26) age>=16.5 162  79 died (0.5123457 0.4876543)  
##            52) parch>=3.5 9   1 died (0.8888889 0.1111111) *
##            53) parch< 3.5 153  75 survived (0.4901961 0.5098039)  
##             106) age>=27.5 44  17 died (0.6136364 0.3863636) *
##             107) age< 27.5 109  48 survived (0.4403670 0.5596330)  
##               214) age< 21.5 28  11 died (0.6071429 0.3928571) *
##               215) age>=21.5 81  31 survived (0.3827160 0.6172840) *
##          27) age< 16.5 33   9 survived (0.2727273 0.7272727) *
##       7) pclass=1st,2nd 250  17 survived (0.0680000 0.9320000) *

## 可视化神将网络的模型
library(neuralnet)
library(NeuralNetTools)
data("iris")
iris$Species <- as.integer(iris$Species )
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2       1
## 2          4.9         3.0          1.4         0.2       1
## 3          4.7         3.2          1.3         0.2       1
## 4          4.6         3.1          1.5         0.2       1
## 5          5.0         3.6          1.4         0.2       1
## 6          5.4         3.9          1.7         0.4       1

mlpfit <- neuralnet(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width,
                    data = iris,hidden = c(5,8,5),linear.output = FALSE)

par(cex = 0.6)
plotnet(mlpfit,pos_col = "red", neg_col = "grey")

图像通常所包含的内容

library(ggplot2)
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2       1
## 2          4.9         3.0          1.4         0.2       1
## 3          4.7         3.2          1.3         0.2       1
## 4          4.6         3.1          1.5         0.2       1
## 5          5.0         3.6          1.4         0.2       1
## 6          5.4         3.9          1.7         0.4       1

iris$Species <- as.factor(iris$Species)

ggplot(iris,aes(x = Sepal.Length,y = Sepal.Width))+
  theme_dark(base_family = "STKaiti")+
  geom_point(aes(shape = Species,colour = Species))+
  labs(x = "X轴",y = "Y轴",
       title = "图像名称",subtitle = "可视化图子名称",
       shape = "图例",colour = "图例")+
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.title = element_text(hjust = 0.5),
        legend.position = c(0.1,0.8))+
  geom_text(aes(x=6,y = 3,label = "绘图区"),
            family= "STKaiti",size = 20,colour = "lightblue",
            alpha = 0.6)