weather_visualition3.R

## 天气数据的可视化分析

## 改文章主要对天气数据进行可视化分析，突出统计学中的数据可视化在天气预报中的应用



## 读取中国的数据
## 全国县级城市201512实时天气数据(预处理后).csv
## 该数据集包括全国县级城市在2015年12月每个监测点的实时监测数据

## 加载包
library(readr)

## Warning: package 'readr' was built under R version 3.3.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.3.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

library(climatol)
library(corrplot)

## Warning: package 'corrplot' was built under R version 3.3.2

## corrplot 0.84 loaded

library(GGally)

## Warning: package 'GGally' was built under R version 3.3.2

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

library(REmap)
library(leaflet)

## Warning: package 'leaflet' was built under R version 3.3.2

library(viridis)

## Loading required package: viridisLite

library(cluster)

## Warning: package 'cluster' was built under R version 3.3.2

library(fpc)
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.3.2

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(readxl)

## Warning: package 'readxl' was built under R version 3.3.2

library(stringr)
library(zoo)

## Warning: package 'zoo' was built under R version 3.3.2

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(RColorBrewer)
library(forecast)

## Warning: package 'forecast' was built under R version 3.3.2

library(ggfortify)
library(tseries)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

## 读取数据
weather <- read_csv("全国县级城市201512实时天气数据(预处理后).csv")

## Parsed with column specification:
## cols(
##   province = col_character(),
##   city = col_character(),
##   town = col_character(),
##   temperature = col_integer(),
##   relative_humidity = col_integer(),
##   rainfall = col_integer(),
##   wind_direction = col_integer(),
##   wind_strong = col_integer(),
##   day = col_date(format = ""),
##   hour = col_character(),
##   year = col_integer(),
##   month = col_integer(),
##   days = col_character()
## )

chinaweather <- data.frame(weather)
## 查看我们的数据
summary(chinaweather)

##    province             city               town          
##  Length:1812240     Length:1812240     Length:1812240    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   temperature      relative_humidity    rainfall        wind_direction 
##  Min.   :-43.000   Min.   :  0.00    Min.   : 0.00000   Min.   :  0.0  
##  1st Qu.: -2.000   1st Qu.: 57.00    1st Qu.: 0.00000   1st Qu.: 77.0  
##  Median :  4.000   Median : 76.00    Median : 0.00000   Median :187.0  
##  Mean   :  2.696   Mean   : 70.96    Mean   : 0.03723   Mean   :181.2  
##  3rd Qu.:  9.000   3rd Qu.: 88.00    3rd Qu.: 0.00000   3rd Qu.:283.0  
##  Max.   : 33.000   Max.   :103.00    Max.   :32.00000   Max.   :360.0  
##   wind_strong          day                 hour                year     
##  Min.   : 0.000   Min.   :2015-12-01   Length:1812240     Min.   :2015  
##  1st Qu.: 0.000   1st Qu.:2015-12-08   Class :character   1st Qu.:2015  
##  Median : 1.000   Median :2015-12-15   Mode  :character   Median :2015  
##  Mean   : 1.733   Mean   :2015-12-15                      Mean   :2015  
##  3rd Qu.: 3.000   3rd Qu.:2015-12-23                      3rd Qu.:2015  
##  Max.   :31.000   Max.   :2015-12-30                      Max.   :2015  
##      month        days          
##  Min.   :12   Length:1812240    
##  1st Qu.:12   Class :character  
##  Median :12   Mode  :character  
##  Mean   :12                     
##  3rd Qu.:12                     
##  Max.   :12

## 我们使用折线图展示一维数据，主要温度、相对湿度、降雨量、风力
## 分析上海的数据
shanghai<- chinaweather[chinaweather$province == "上海",]
## 数据准备
shanghai_day <- shanghai %>%
  dplyr::group_by(city,days)  %>%  ## 按照城市和天分组
  summarise(MeanTemperature = mean(temperature),  #平均温度
            MeanRelative_humidity = mean(relative_humidity),  #平均相对湿度
            MeanRainfall = mean(rainfall) * 24,  #每天平均降雨量
            MeanWind_strong = mean(wind_strong)) #平均风力

## Warning: package 'bindrcpp' was built under R version 3.3.2

## 绘制折线图
## 转换数据格式
shanghai_day <- data.frame(days = rep(1:30,4),
                           values = as.vector(as.matrix(shanghai_day[,3:6])),
                           group = rep(c("平均温度(℃)","平均相对湿度","日降雨量(mm)","平均风力(m/s)"),
                                       each = dim(shanghai_day)[1]))

ggplot(shanghai_day) +theme_bw(base_family = "STKaiti") +
  geom_line(aes(days,values),colour = "red") +
  facet_wrap(~group,scales = "free") + 
  scale_x_continuous(breaks = seq(1,30,4),labels = paste(seq(1,30,4),"日",sep = "")) + 
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y="数值",title ="2015年12月份上海天气") +
  xlab("日期")

## 1:针对风向和风力数据，可以绘制风向玫瑰图
shanghai<- chinaweather[chinaweather$province == "上海",]
##  将风向划分为16个区间，代表16个方向
## 切分方向
dirbreak <- seq(-12.25,360,22.5)
## 代表的方向
dirdengji <-c("N","NNE","NE","ENE","E","ESE","SE","SSE","S","SSW","SW","WSW","W","WNW","NW","NNW")
## 调整角度
index <- shanghai$wind_direction > 347.25
shanghai$wind_direction[index] <- shanghai$wind_direction[index] - 360
summary(shanghai$wind_direction)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -12.00   66.75  237.00  186.20  295.00  347.00

shanghai$wind_direction2 <- cut(shanghai$wind_direction,breaks = dirbreak,
                                labels = dirdengji,include.lowest = TRUE)
summary(shanghai$wind_direction2)

##    N  NNE   NE  ENE    E  ESE   SE  SSE    S  SSW   SW  WSW    W  WNW   NW 
##  592  587  541  606  553  288  187  129  130  135  161  394  842 1157  933 
##  NNW 
##  685

## 将风力的强弱划分为4各等级
shanghai$wind_strong2 <- cut_interval(shanghai$wind_strong,4)

shwind <- as.data.frame.array(table(shanghai$wind_strong2,shanghai$wind_direction2))

par(family = "STKaiti")
rosavent(shwind, 4, 4, ang=-3*pi/16, margen=c(0,0,2,0),
         col=rainbow(4,0.5,0.92,start=0.1,end=0.9),
         main="上海2015年12月风向玫瑰图")

## 3:分析多个变量之间的关系
## 3.1:上海天气之间的相关性
shanghai<- chinaweather[chinaweather$city == "上海",]
## 分析各个监测点监测到的数据到的相关性-------------------------------------
## 数据准备
shanghai_day <- shanghai %>%
  dplyr::group_by(city,days)  %>%  ## 按照城市和天分组
  summarise(MeanTemperature = mean(temperature),  #平均温度
            MeanRelative_humidity = mean(relative_humidity),  #平均相对湿度
            MeanRainfall = mean(rainfall) * 24,  #每天平均降雨量
            MeanWind_strong = mean(wind_strong)) #平均风力
weacor <- cor(shanghai_day[3:6])
rownames(weacor) <- c("温度","相对湿度","降雨量","风力")
colnames(weacor) <- c("温度","相对湿度","降雨量","风力") 
par(family = "STKaiti",mfrow = c(1,1),cex=0.8)
corrplot(weacor,method = "pie",type = "upper",
         mar = c(2, 2, 2, 10),title = "天气相关系数")

## 绘制矩阵散点图分析
smdata <- data.frame(shanghai_day[3:6])
names(smdata) <- c("温度","相对湿度","降雨量","风力")
ggscatmat(smdata) + theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "",y = "",title = "天气数据")

## 4:通过地图对天气数据进行可视化
## 获取所有城市的信息，并的到数据可视化地图####-------------------------------
citys <- chinaweather %>%
  group_by(city,day)  %>%
  summarise(MeanTemperature = mean(temperature),  #平均温度
            MeanRelative_humidity = mean(relative_humidity),  #平均相对湿度
            MeanRainfall = mean(rainfall) * 24,  #平均每天降雨量
            MeanWind_strong = mean(wind_strong)) %>%#平均风力
  arrange(city)


citys$city <- as.character(citys$city)
## 读取数据
citys_geo <- read_csv("各个城市的经纬度.csv")

## Parsed with column specification:
## cols(
##   lon = col_double(),
##   lat = col_double(),
##   city = col_character()
## )

## 查看两个城市的差集
setdiff(unique(citys$city), citys_geo$city)

## [1] "甘南"

citys <- citys[citys$city != setdiff(unique(citys$city), citys_geo$city),]
## 连接天气数据集和坐标数据集
newcitys <- left_join(citys,citys_geo,by = c("city" = "city"))

## 绘制可交互的地图 ------------------------
## 提取一天的数据
newcitys_day <- newcitys[newcitys$day == "2015-12-15",]
## 定义颜色 -------------------------
tempcolor <- colorNumeric(viridis(6,option = "plasma"),newcitys_day$MeanTemperature)
rehucolor <- colorNumeric(viridis(6,option = "plasma"),newcitys_day$MeanRelative_humidity)
raincolor <- colorNumeric(viridis(6,option = "viridis"),newcitys_day$MeanRainfall)
windcolor <- colorNumeric(viridis(6,option = "viridis"),newcitys_day$MeanWind_strong)

## 绘制地图--------------------------------
## 天气数据的地图可视化，展示各个地点的天气状况
## 该图像，每个参数使用颜色和大小进行编码，使用鼠标点击圆，会显示相应的点和参数
map <- leaflet(data = newcitys_day,width = 850, height = 600) %>%
  setView(lng = 103,lat = 35 ,zoom = 4) %>%
  addTiles() %>% 
  addPopups(lng = 103, lat = 45, popup = "2015/12/15")%>%
  addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
                   stroke = FALSE,group = "温度",
                   fillOpacity = 0.8,radius = ~ (8 + newcitys_day$MeanTemperature/4),
                   popup = ~paste(newcitys_day$city,round(newcitys_day$MeanTemperature,2),sep = "-温度:"),
                   color = ~tempcolor(newcitys_day$MeanTemperature)) %>%
  addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
                   stroke = FALSE,group = "相对湿度",
                   fillOpacity = 0.8,radius = ~ (2 + newcitys_day$MeanRelative_humidity/10),
                   popup = ~paste(newcitys_day$city,round(newcitys_day$MeanRelative_humidity,2),sep = "-相对湿度:"),
                   color = ~rehucolor(newcitys_day$MeanRelative_humidity)) %>%
  addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
                   stroke = FALSE,group = "降雨量",
                   fillOpacity = 0.8,radius = ~ (4.5 + newcitys_day$MeanRainfall),
                   popup = ~paste(newcitys_day$city,round(newcitys_day$MeanRainfall,2),sep = "-降雨量:"),
                   color = ~raincolor(newcitys_day$MeanRainfall)) %>%
  addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
                   stroke = FALSE,group = "风力",
                   fillOpacity = 0.8,radius = ~ (2 + newcitys_day$MeanWind_strong),
                   popup = ~paste(newcitys_day$city,round(newcitys_day$MeanWind_strong,2),sep = "-风力:"),
                   color = ~windcolor(newcitys_day$MeanWind_strong)) %>%
  addLegend("topleft",pal = tempcolor, values = newcitys_day$MeanTemperature,
            title ="温度(℃)",opacity = 0.5) %>%
  addLegend("topleft",pal = rehucolor, values = newcitys_day$MeanRelative_humidity,
            title = "相对湿度",opacity = 0.5) %>%
  addLayersControl(baseGroups = c("温度","相对湿度","降雨量","风力"),
                   options = layersControlOptions(collapsed = FALSE),
                   position = "topright") %>%
  addLegend("topright",pal = raincolor, values = newcitys_day$MeanRainfall,
            title = "降雨量(mm)",opacity = 0.5) %>%
  addLegend("topright",pal = windcolor, values = newcitys_day$MeanWind_strong,
            title = "风力(m/s)",opacity = 0.5) 
map

## =======================================================
## 数据预处理
## 获取每个城市12月份每天的天气数据
citys <- chinaweather %>%
  group_by(city,day)  %>%
  summarise(MeanTemperature = mean(temperature),  #平均温度
            MeanRelative_humidity = mean(relative_humidity),  #平均相对湿度
            MeanRainfall = mean(rainfall) * 24,  #平均每天降雨量
            MeanWind_strong = mean(wind_strong))#平均风力

## 清洗数据，剔除甘南这个地方

citys <- citys[citys$city != "甘南",]
## 查看我们都有哪些地点
unique(citys$city)

##   [1] "阿坝"     "阿克苏"   "阿拉尔"   "阿拉善盟" "阿勒泰"   "阿里"    
##   [7] "安康"     "安庆"     "安顺"     "安阳"     "鞍山"     "巴彦淖尔"
##  [13] "巴音郭楞" "巴中"     "白城"     "白沙"     "白山"     "白银"    
##  [19] "百色"     "蚌埠"     "包头"     "宝鸡"     "保定"     "保山"    
##  [25] "保亭"     "北海"     "北京"     "本溪"     "毕节"     "滨州"    
##  [31] "亳州"     "博尔塔拉" "沧州"     "昌都"     "昌吉"     "昌江"    
##  [37] "常德"     "常州"     "朝阳"     "潮州"     "郴州"     "成都"    
##  [43] "承德"     "澄迈"     "池州"     "赤峰"     "重庆"     "崇左"    
##  [49] "滁州"     "楚雄"     "达州"     "大理"     "大连"     "大庆"    
##  [55] "大同"     "大兴安岭" "丹东"     "儋州"     "德宏"     "德阳"    
##  [61] "德州"     "迪庆"     "定安"     "定西"     "东方"     "东莞"    
##  [67] "东营"     "鄂尔多斯" "鄂州"     "恩施"     "防城港"   "佛山"    
##  [73] "福州"     "抚顺"     "抚州"     "阜新"     "阜阳"     "甘孜"    
##  [79] "赣州"     "固原"     "广安"     "广元"     "广州"     "贵港"    
##  [85] "贵阳"     "桂林"     "果洛"     "哈尔滨"   "哈密"     "海北"    
##  [91] "海东"     "海口"     "海南"     "海西"     "邯郸"     "汉中"    
##  [97] "杭州"     "合肥"     "和田"     "河池"     "河源"     "菏泽"    
## [103] "贺州"     "鹤壁"     "鹤岗"     "黑河"     "衡水"     "衡阳"    
## [109] "红河"     "呼和浩特" "呼伦贝尔" "湖州"     "葫芦岛"   "怀化"    
## [115] "淮安"     "淮北"     "淮南"     "黄冈"     "黄南"     "黄山"    
## [121] "黄石"     "惠州"     "鸡西"     "吉安"     "吉林"     "济南"    
## [127] "济宁"     "济源"     "佳木斯"   "嘉兴"     "江门"     "焦作"    
## [133] "揭阳"     "金昌"     "金华"     "锦州"     "晋城"     "晋中"    
## [139] "荆门"     "荆州"     "景德镇"   "九江"     "酒泉"     "喀什"    
## [145] "开封"     "克拉玛依" "克州"     "昆明"     "拉萨"     "来宾"    
## [151] "莱芜"     "兰州"     "廊坊"     "乐东"     "乐山"     "丽江"    
## [157] "丽水"     "连云港"   "凉山"     "辽阳"     "辽源"     "聊城"    
## [163] "林芝"     "临沧"     "临汾"     "临高"     "临夏"     "临沂"    
## [169] "陵水"     "柳州"     "六安"     "六盘水"   "龙岩"     "陇南"    
## [175] "娄底"     "泸州"     "吕梁"     "洛阳"     "漯河"     "马鞍山"  
## [181] "茂名"     "眉山"     "梅州"     "绵阳"     "牡丹江"   "那曲"    
## [187] "南昌"     "南充"     "南京"     "南宁"     "南平"     "南通"    
## [193] "南阳"     "内江"     "宁波"     "宁德"     "怒江"     "攀枝花"  
## [199] "盘锦"     "平顶山"   "平凉"     "萍乡"     "莆田"     "濮阳"    
## [205] "普洱"     "七台河"   "齐齐哈尔" "潜江"     "黔东南"   "黔南"    
## [211] "黔西南"   "钦州"     "秦皇岛"   "青岛"     "清远"     "庆阳"    
## [217] "琼海"     "琼中"     "曲靖"     "衢州"     "泉州"     "日喀则"  
## [223] "日照"     "三门峡"   "三明"     "三亚"     "厦门"     "山南"    
## [229] "汕头"     "汕尾"     "商洛"     "商丘"     "上海"     "上饶"    
## [235] "韶关"     "邵阳"     "绍兴"     "深圳"     "神农架"   "沈阳"    
## [241] "十堰"     "石河子"   "石家庄"   "石嘴山"   "双鸭山"   "朔州"    
## [247] "四平"     "松原"     "苏州"     "宿迁"     "宿州"     "绥化"    
## [253] "随州"     "遂宁"     "塔城"     "台州"     "太原"     "泰安"    
## [259] "泰州"     "唐山"     "天津"     "天门"     "天水"     "铁岭"    
## [265] "通化"     "通辽"     "铜川"     "铜陵"     "铜仁"     "吐鲁番"  
## [271] "屯昌"     "万宁"     "威海"     "潍坊"     "渭南"     "温州"    
## [277] "文昌"     "文山"     "乌海"     "乌兰察布" "乌鲁木齐" "无锡"    
## [283] "吴忠"     "芜湖"     "梧州"     "五指山"   "武汉"     "武威"    
## [289] "锡林郭勒" "西安"     "西宁"     "西沙"     "西双版纳" "咸宁"    
## [295] "咸阳"     "仙桃"     "湘潭"     "湘西"     "襄阳"     "孝感"    
## [301] "忻州"     "新乡"     "新余"     "信阳"     "邢台"     "兴安盟"  
## [307] "徐州"     "许昌"     "宣城"     "雅安"     "烟台"     "延安"    
## [313] "延边"     "盐城"     "扬州"     "阳江"     "阳泉"     "杨凌"    
## [319] "伊春"     "伊犁"     "益阳"     "宜宾"     "宜昌"     "宜春"    
## [325] "银川"     "鹰潭"     "营口"     "永州"     "榆林"     "玉林"    
## [331] "玉树"     "玉溪"     "岳阳"     "云浮"     "运城"     "枣庄"    
## [337] "湛江"     "张家界"   "张家口"   "张掖"     "漳州"     "长春"    
## [343] "长沙"     "长治"     "昭通"     "肇庆"     "镇江"     "郑州"    
## [349] "中山"     "中卫"     "舟山"     "周口"     "株洲"     "珠海"    
## [355] "驻马店"   "资阳"     "淄博"     "自贡"     "遵义"

length(unique(citys$city))

## [1] 359

## 一共右359个地点

## 对数据的特征进行准备工作
## 我们一共有一个月的数据，我们的特征可能需要包括，该月四个天气因素的最大值、
##  最小值、均值、标准差等。
citys_weather <- citys %>%
  group_by(city) %>%
  summarise(MeanTemperature2 = mean(MeanTemperature),  #平均温度
            MinTemperature = min(MeanTemperature),  #最低温度
            MaxTemperature = max(MeanTemperature),  #最低温度
            SdTemperature = sd(MeanTemperature),  #温度标准差
            MeanRelative_humidity2 = mean(MeanRelative_humidity),#平均相对湿度
            MinRelative_humidity = min(MeanRelative_humidity),#最小相对湿
            MaxRelative_humidity = max(MeanRelative_humidity),  #最大相对湿
            SdRelative_humidity = sd(MeanRelative_humidity),  #相对湿标准差
            MeanRainfall2 = mean(MeanRainfall),  #平均每天降雨量
            MaxRainfall = max(MeanRainfall),  #最大每天降雨量;因为所有的最小降雨量都为0，所以剔除
            SdRainfall = sd(MeanRainfall),  #每天降雨量标准差
            MeanWind_strong2 = mean(MeanWind_strong),#平均风力
            MinWind_strong = min(MeanWind_strong),#平均风力
            MaxWind_strong = max(MeanWind_strong),#平均风力
            SdWind_strong = sd(MeanWind_strong)#平均风力
  )

## 对我们的特征数据进行标准化
citys_cluster <- apply(citys_weather[,2:16],2,scale)

## ==============================================================
## ===============k均值聚类算法======================================
## ==============================================================
## 使用k均值聚类算法进行聚类
## 探索由聚类数目k的变化
## 计算聚类的残差平方和
set.seed(1234)

## 计算组内平方和  组间平房和
tot_withinss <- vector()
betweenss <- vector()
kk = 30
for(ii in 1:kk){
  k1 <- kmeans(citys_cluster,ii,iter.max = 100,algorithm = "MacQueen")
  tot_withinss[ii] <- k1$tot.withinss
  betweenss[ii] <- k1$betweenss
}

kmeanvalue <- data.frame(kk = 1:kk,
                         tot_withinss = tot_withinss,
                         betweenss = betweenss)


p1 <- ggplot(kmeanvalue,aes(x = kk,y = tot_withinss))+
  theme_bw(base_family = "STKaiti") +
  geom_point(colour = "red") +
  geom_line() +
  labs(x = "kmean 聚类个数",y = "value") +
  ggtitle("簇内平方和总和")+
  theme(plot.title = element_text(hjust = 0.5))


p2 <- ggplot(kmeanvalue,aes(x = kk,y = betweenss))+
  theme_bw(base_family = "STKaiti") +
  geom_point(colour = "red") +
  geom_line() +
  labs(x = "kmean 聚类个数",y = "value") +
  ggtitle("簇间平方和")+
  theme(plot.title = element_text(hjust = 0.5))


grid.arrange(p1,p2,nrow = 2)

## 从上面的k均值聚类图，我们可以发现，我们可以将聚类的个数定为10类,这个类说在5以上都可以
## 因为在10之前，这两个指标变化的比较剧烈，之后变化较为平缓，所以定为10类

## 聚类为k类，查看聚类结果
k = 8  ## 聚类的数目
kcluster <- kmeans(citys_cluster,k,iter.max = 100,algorithm = "MacQueen")
## 输出我们的聚类结果
kcluster

## K-means clustering with 8 clusters of sizes 64, 21, 47, 56, 90, 45, 14, 22
## 
## Cluster means:
##   MeanTemperature2 MinTemperature MaxTemperature SdTemperature
## 1        0.3576381      0.4393421      0.2715715   -0.41879862
## 2        1.3052462      1.1381350      1.4541089    0.09865897
## 3        0.7623438      0.7372296      0.7624496   -0.38224793
## 4       -1.4734001     -1.6704345     -1.2837192    1.66898621
## 5       -0.3132209     -0.1880822     -0.4338724   -0.50966496
## 6        0.4803326      0.4244698      0.5592737   -0.14687127
## 7        1.7348037      1.5453227      1.8146054   -0.13379968
## 8       -0.9695955     -0.7696567     -1.0630455    0.16299761
##   MeanRelative_humidity2 MinRelative_humidity MaxRelative_humidity
## 1             0.45965533            0.9327377           0.02243190
## 2             0.52582301           -0.3582297           0.60376145
## 3             0.84977432            0.1876716           0.63639789
## 4            -0.05294843            0.5710032          -0.22492713
## 5            -0.63514273           -0.9207584           0.03901732
## 6             0.51894233            0.2032209           0.54675735
## 7             0.96531144            0.9444179           0.48195145
## 8            -2.59720112           -1.4758029          -3.01328968
##   SdRelative_humidity MeanRainfall2 MaxRainfall SdRainfall
## 1          -0.9393036    -0.3528499  -0.3567487 -0.3704743
## 2           0.2805187     2.1023911   3.0789295  2.8390010
## 3           0.2998509     1.6021797   1.0297305  1.2195072
## 4          -0.6945036    -0.5435679  -0.4762532 -0.5011817
## 5           1.0342239    -0.5917613  -0.5649016 -0.5938365
## 6           0.2815534    -0.2378063  -0.2046940 -0.2082598
## 7          -0.6753684     0.4283716   0.5163096  0.5178990
## 8          -0.7850521    -0.3849028  -0.4876690 -0.4360427
##   MeanWind_strong2 MinWind_strong MaxWind_strong SdWind_strong
## 1      -1.03341559    -0.67227417  -1.171789e+00   -1.12930655
## 2       0.52119321     0.22414059   2.820168e-01    0.10182832
## 3      -0.23263230    -0.15949801  -4.548059e-01   -0.46206258
## 4       0.47064732     0.22888869   6.672347e-01    0.71489865
## 5      -0.07989664    -0.30280188   2.467000e-01    0.40195893
## 6       0.08140400     0.07392906   5.037447e-05   -0.02064721
## 7       2.73480694     2.37449968   1.953025e+00    1.60459436
## 8       0.22778315     1.07634545   1.606925e-01   -0.26780269
## 
## Clustering vector:
##   [1] 8 1 1 8 4 8 1 6 6 5 4 4 4 1 4 1 4 1 3 6 4 1 5 1 6 7 5 4 1 5 5 4 5 8 4
##  [36] 7 1 6 5 2 3 1 5 3 6 4 1 3 6 6 1 1 4 4 5 4 4 6 1 1 5 5 7 1 7 2 5 4 5 1
##  [71] 7 2 3 4 3 4 5 8 3 5 1 1 2 3 6 3 8 4 4 8 8 7 8 8 5 1 3 6 5 3 2 5 3 5 4
## [106] 4 5 3 1 4 4 6 5 1 6 5 5 6 8 3 6 2 4 3 4 5 5 5 4 6 2 5 2 8 3 5 5 5 5 6
## [141] 3 6 4 5 5 4 8 6 8 3 5 1 5 6 1 5 3 5 1 4 4 5 8 1 5 6 1 5 7 3 6 6 2 1 3
## [176] 1 5 5 5 6 3 1 2 1 4 8 3 1 6 3 3 6 5 1 3 3 1 1 4 5 1 3 2 5 1 4 4 1 1 1
## [211] 6 6 5 5 2 5 7 1 6 3 2 8 5 5 3 7 2 8 2 2 5 5 6 3 2 3 3 2 5 4 1 4 5 5 4
## [246] 5 4 4 6 5 5 4 5 1 4 3 5 5 6 5 5 6 1 4 4 4 5 6 1 1 1 7 4 5 5 3 3 1 5 4
## [281] 4 6 5 6 3 1 6 8 4 1 8 7 1 6 5 6 3 1 5 5 5 5 3 5 5 4 5 5 6 1 4 5 4 6 6
## [316] 2 5 1 4 1 1 1 1 3 5 3 4 3 5 7 8 6 6 3 5 5 7 1 4 8 2 4 3 5 1 3 6 5 2 5
## [351] 7 5 3 3 5 1 5 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 274.73868  90.33927 155.94932 412.32102 342.65000 160.86729 171.57558
## [8] 212.03634
##  (between_SS / total_SS =  66.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

## 查看每一类有多少个地点
table(kcluster$cluster)

## 
##  1  2  3  4  5  6  7  8 
## 64 21 47 56 90 45 14 22

## 聚类结果可视化,该图因为聚类个数太多所以可以不分析
clusplot(citys_cluster,kcluster$cluster,main = paste("kmean cluster number=",k,sep = ""))

## 表示聚类效果,聚类中的轮廓信息
si1 <- silhouette(kcluster$cluster,dist(citys_cluster,method = "euclidean"))
par(cex = 0.9)
plot(si1,main = "kmean silhouette",col = "red")

## 从聚类中的轮廓信息轮廓信息图，我们可以发现，我们的聚类效果还是很好的
## 每一类中轮廓图中的直方条形越接近于1的越多，说明这些聚类在一族越好。

## 查看哪些城市都氛围了哪一类
city_clu <- data.frame(city = citys_weather$city,
                       cluster = kcluster$cluster)

city_clu$city <- as.character(city_clu$city)
## 我们将这些类别在地图上展示出来
## 链接数据
## 连接天气数据集和坐标数据集
newcitys <- left_join(city_clu,citys_geo,by = c("city" = "city"))

## 将聚类数据在地图上显示出来
## 定义颜色 -------------------------
clucolor <- colorFactor(viridis(k,option = "viridis"),newcitys$cluster)
## 绘制地图--------------------------------
## k均值聚类结果展示在地图上，用开方便的分析我们的聚类结果
## 该图像，聚类得懂的类别使用颜色进行编码，使用鼠标点击圆，会显示相应的地点和类别
map <- leaflet(data = newcitys,width = 800, height = 600) %>%
  setView(lng = 103,lat = 35 ,zoom = 4) %>%
  addTiles() %>% 
  addCircleMarkers(lng = newcitys$lon, lat = newcitys$lat,
                   stroke = FALSE,group = "类别",
                   fillOpacity = 0.8,radius = 8.5,
                   popup = ~paste(newcitys$city,newcitys$cluster,sep = "-类别为:"),
                   color = ~clucolor(newcitys$cluster)) %>%
  addLegend("topleft",pal = clucolor, values = newcitys$cluster,
            title ="类别",opacity = 0.5) %>%
  addLayersControl(baseGroups = c("K means 聚类结果"),
                   options = layersControlOptions(collapsed = FALSE),
                   position = "topright")
map

## ===================================================






## 太原温度数据可视化
## 读取数据
taiyuan <- read_excel("太原.xlsx",col_names = FALSE) 
names(taiyuan) <- c("time","MeanTemp")
## 整理数据
## 定义时间数据，时间从12年1月开始，到17年5月结束
taiyuan$time <- as.Date(as.yearmon(2012 + seq(0,64)/12))
taiyuan$year <- year(taiyuan$time)
taiyuan$month <- month(taiyuan$time)

## 设定月份
mymonths <- c("January","February","March","April","May","June","July",
              "August","September","October","November","December")

## 等级
month.name <- sort(unique(taiyuan$month))
taiyuan$month2 <- factor(taiyuan$month, levels = month.name,labels = mymonths)
head(taiyuan)

## # A tibble: 6 x 5
##   time       MeanTemp  year month month2  
##   <date>        <dbl> <dbl> <dbl> <fct>   
## 1 2012-01-01      -6. 2012.    1. January 
## 2 2012-02-01      -3. 2012.    2. February
## 3 2012-03-01       4. 2012.    3. March   
## 4 2012-04-01      14. 2012.    4. April   
## 5 2012-05-01      20. 2012.    5. May     
## 6 2012-06-01      22. 2012.    6. June

## ==================================================================
## 数据可视化
## 温度的热力图
ggplot(data=taiyuan, aes(x=year,y=month2)) + 
  theme_bw(base_family = "STKaiti") +
  geom_tile(aes(fill = MeanTemp),colour = "white") + 
  geom_text(aes(label = round(MeanTemp,1))) +
  scale_fill_gradientn(colours=rev(brewer.pal(10,'Spectral'))) + 
  theme(legend.title=element_blank(),
        axis.title.y=element_blank(),
        axis.title.x=element_blank(),
        legend.position="top") + 
  ggtitle("太原每月平均温度") +
  labs(x="年份",y = "月份") +
  theme(plot.title = element_text(hjust = 0.5))

## 散点图盒子图
ggplot(data =taiyuan,aes(x=month2,y=MeanTemp,color=MeanTemp)) +
  theme_bw(base_family = "STKaiti") +
  scale_color_gradientn(colours=rev(brewer.pal(10,'Spectral'))) + 
  geom_boxplot(colour='black',size=.4,alpha=.5) + 
  geom_jitter(shape=16,width=.2,size=2) + 
  theme(legend.title=element_blank(),
        legend.position='top',
        axis.text.x = element_text(angle=45, hjust=1),
        plot.title = element_text(hjust = 0.5)) + 
  scale_y_continuous(breaks = seq(-10,30,5),labels = seq(-10,30,5)) +
  ggtitle("太原每月的平均温度") + 
  xlab('') + ylab('2012～2017温度(℃)')

## 平均气温的曲线图
ggplot(data = taiyuan,aes(x=time,y=MeanTemp)) +
  theme_bw(base_family = "STKaiti") +
  geom_line() +
  geom_point(colour = "red",size = 1) + 
  scale_x_date(date_breaks = "1 year",date_labels = "%Y") +
  xlab('年份') + ylab('温度(℃)') +
  ggtitle("太原每月平均温度") + 
  theme(plot.title = element_text(hjust = 0.5))

## ==============================================================
## ==========使用时间序列分析温度的变化，并进行预测==============
## ==============================================================
## ==============================================================
## ==============================================================

## 定义时间序列数据
tsdata <- ts(data = taiyuan$MeanTemp,start = c(2012,1),frequency = 12)
par(family = "STKaiti")
plot(tsdata,xlab = "时间",ylab = "温度(℃)",main="太原每月平均温度")

## 计算我们的数据个数
length(tsdata)

## [1] 65

## 我们一共有65个数据，我们使用前55个作为模型的训练数据，
## 训练合适的时间序列模型，涌来预测后面的数据
nn = 55
taiyuan[55,]

## # A tibble: 1 x 5
##   time       MeanTemp  year month month2
##   <date>        <dbl> <dbl> <dbl> <fct> 
## 1 2016-07-01      24. 2016.    7. July

tstrain <- ts(taiyuan$MeanTemp[1:nn],start = c(2012,1),frequency = 12)
tstest <- ts(taiyuan$MeanTemp[(nn+1):65],star = c(2016,8),frequency = 12)

## =============================================================================
## 使用新的模型来重新训练季节ARIMA模型
mod2 <- Arima(tstrain,order = c(0,1,1),seasonal = list(order = c(0,1,1)))
mod2

## Series: tstrain 
## ARIMA(0,1,1)(0,1,1)[12] 
## 
## Coefficients:
##           ma1     sma1
##       -0.8545  -0.9985
## s.e.   0.0951   1.0260
## 
## sigma^2 estimated as 1.578:  log likelihood=-77.87
## AIC=161.74   AICc=162.37   BIC=166.95

par(family = "STKaiti")
plot(forecast(mod2,h=10),xlab = "时间",ylab = "温度(℃)",shadecols="oldstyle")

## 在ggplot中绘制我们的图像
X <- forecast(mod2,h=10)
X

##          Point Forecast     Lo 80      Hi 80      Lo 95     Hi 95
## Aug 2016     22.8284798 21.027522 24.6294380 20.0741519 25.582808
## Sep 2016     17.8284824 16.008627 19.6483377 15.0452539 20.611711
## Oct 2016     11.8284827  9.989924 13.6670408  9.0166506 14.640315
## Nov 2016      3.5784879  1.721415  5.4355606  0.7383403  6.418636
## Dec 2016     -3.1715056 -5.046910 -1.2961011 -6.0396892 -0.303322
## Jan 2017     -3.9104158 -5.788602 -2.0322301 -6.7828531 -1.037979
## Feb 2017     -0.9104113 -2.806003  0.9851804 -3.8094686  1.988646
## Mar 2017      7.0895835  5.176744  9.0024227  4.1641483 10.015019
## Apr 2017     13.8895869 11.959654 15.8195195 10.9380097 16.841164
## May 2017     19.4895763 17.542700 21.4364522 16.5120865 22.467066

## 构建显得数据集
tempdatatrain <- data.frame(MeanTemp = taiyuan$MeanTemp[1:nn],
                            time = taiyuan$time[1:nn])
tempdatatrain$class <- "训练数据"
tempdatatest <- data.frame(MeanTemp = taiyuan$MeanTemp[(nn+1):65],
                           time = taiyuan$time[(nn+1):65])
tempdatatest$class <- "测试数据"
tempdatapre <- data.frame(MeanTemp = as.numeric(X$mean),
                          time = tempdatatest$time)
tempdatapre$class <- "预测数据"
## 组合数据
plottemp <- rbind(tempdatatrain,tempdatatest,tempdatapre)

## 获得置信区间的数据
X95 <- data.frame(xlower = X$lower[,2],
                  xupper = X$upper[,2],
                  time = tempdatatest$time,
                  MeanTemp = as.numeric(X$mean))


## 绘制图像
titles <- paste("太原:","ARIMA:(",as.character(mod2$call[3]),
                str_sub(as.character(mod2$call[4]),14,23),"[12])")

ggplot(plottemp,aes(x = time,y = MeanTemp,
                    linetype=class,colour = class,shape = class)) +
  theme_bw(base_family = "STKaiti")+
  geom_line(size = 0.5) +
  labs(x = "时间",y = "温度(℃)",title=titles) +
  geom_point(size = 1.5) + 
  scale_x_date(date_breaks = "1 year",date_labels = "%Y") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position="top",legend.title = element_blank())

##

weather_visualition3.R

daitu

Sat Jun 9 13:05:59 2018