## 天气数据的可视化分析
## 改文章主要对天气数据进行可视化分析,突出统计学中的数据可视化在天气预报中的应用
## 读取中国的数据
## 全国县级城市201512实时天气数据(预处理后).csv
## 该数据集包括全国县级城市在2015年12月每个监测点的实时监测数据
## 加载包
library(readr)
## Warning: package 'readr' was built under R version 3.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(climatol)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.2
## corrplot 0.84 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 3.3.2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(REmap)
library(leaflet)
## Warning: package 'leaflet' was built under R version 3.3.2
library(viridis)
## Loading required package: viridisLite
library(cluster)
## Warning: package 'cluster' was built under R version 3.3.2
library(fpc)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.3.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(readxl)
## Warning: package 'readxl' was built under R version 3.3.2
library(stringr)
library(zoo)
## Warning: package 'zoo' was built under R version 3.3.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(RColorBrewer)
library(forecast)
## Warning: package 'forecast' was built under R version 3.3.2
library(ggfortify)
library(tseries)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## 读取数据
weather <- read_csv("全国县级城市201512实时天气数据(预处理后).csv")
## Parsed with column specification:
## cols(
## province = col_character(),
## city = col_character(),
## town = col_character(),
## temperature = col_integer(),
## relative_humidity = col_integer(),
## rainfall = col_integer(),
## wind_direction = col_integer(),
## wind_strong = col_integer(),
## day = col_date(format = ""),
## hour = col_character(),
## year = col_integer(),
## month = col_integer(),
## days = col_character()
## )
chinaweather <- data.frame(weather)
## 查看我们的数据
summary(chinaweather)
## province city town
## Length:1812240 Length:1812240 Length:1812240
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## temperature relative_humidity rainfall wind_direction
## Min. :-43.000 Min. : 0.00 Min. : 0.00000 Min. : 0.0
## 1st Qu.: -2.000 1st Qu.: 57.00 1st Qu.: 0.00000 1st Qu.: 77.0
## Median : 4.000 Median : 76.00 Median : 0.00000 Median :187.0
## Mean : 2.696 Mean : 70.96 Mean : 0.03723 Mean :181.2
## 3rd Qu.: 9.000 3rd Qu.: 88.00 3rd Qu.: 0.00000 3rd Qu.:283.0
## Max. : 33.000 Max. :103.00 Max. :32.00000 Max. :360.0
## wind_strong day hour year
## Min. : 0.000 Min. :2015-12-01 Length:1812240 Min. :2015
## 1st Qu.: 0.000 1st Qu.:2015-12-08 Class :character 1st Qu.:2015
## Median : 1.000 Median :2015-12-15 Mode :character Median :2015
## Mean : 1.733 Mean :2015-12-15 Mean :2015
## 3rd Qu.: 3.000 3rd Qu.:2015-12-23 3rd Qu.:2015
## Max. :31.000 Max. :2015-12-30 Max. :2015
## month days
## Min. :12 Length:1812240
## 1st Qu.:12 Class :character
## Median :12 Mode :character
## Mean :12
## 3rd Qu.:12
## Max. :12
## 我们使用折线图展示一维数据,主要温度、相对湿度、降雨量、风力
## 分析上海的数据
shanghai<- chinaweather[chinaweather$province == "上海",]
## 数据准备
shanghai_day <- shanghai %>%
dplyr::group_by(city,days) %>% ## 按照城市和天分组
summarise(MeanTemperature = mean(temperature), #平均温度
MeanRelative_humidity = mean(relative_humidity), #平均相对湿度
MeanRainfall = mean(rainfall) * 24, #每天平均降雨量
MeanWind_strong = mean(wind_strong)) #平均风力
## Warning: package 'bindrcpp' was built under R version 3.3.2
## 绘制折线图
## 转换数据格式
shanghai_day <- data.frame(days = rep(1:30,4),
values = as.vector(as.matrix(shanghai_day[,3:6])),
group = rep(c("平均温度(℃)","平均相对湿度","日降雨量(mm)","平均风力(m/s)"),
each = dim(shanghai_day)[1]))
ggplot(shanghai_day) +theme_bw(base_family = "STKaiti") +
geom_line(aes(days,values),colour = "red") +
facet_wrap(~group,scales = "free") +
scale_x_continuous(breaks = seq(1,30,4),labels = paste(seq(1,30,4),"日",sep = "")) +
theme(plot.title = element_text(hjust = 0.5)) +
labs(y="数值",title ="2015年12月份上海天气") +
xlab("日期")

## 1:针对风向和风力数据,可以绘制风向玫瑰图
shanghai<- chinaweather[chinaweather$province == "上海",]
## 将风向划分为16个区间,代表16个方向
## 切分方向
dirbreak <- seq(-12.25,360,22.5)
## 代表的方向
dirdengji <-c("N","NNE","NE","ENE","E","ESE","SE","SSE","S","SSW","SW","WSW","W","WNW","NW","NNW")
## 调整角度
index <- shanghai$wind_direction > 347.25
shanghai$wind_direction[index] <- shanghai$wind_direction[index] - 360
summary(shanghai$wind_direction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12.00 66.75 237.00 186.20 295.00 347.00
shanghai$wind_direction2 <- cut(shanghai$wind_direction,breaks = dirbreak,
labels = dirdengji,include.lowest = TRUE)
summary(shanghai$wind_direction2)
## N NNE NE ENE E ESE SE SSE S SSW SW WSW W WNW NW
## 592 587 541 606 553 288 187 129 130 135 161 394 842 1157 933
## NNW
## 685
## 将风力的强弱划分为4各等级
shanghai$wind_strong2 <- cut_interval(shanghai$wind_strong,4)
shwind <- as.data.frame.array(table(shanghai$wind_strong2,shanghai$wind_direction2))
par(family = "STKaiti")
rosavent(shwind, 4, 4, ang=-3*pi/16, margen=c(0,0,2,0),
col=rainbow(4,0.5,0.92,start=0.1,end=0.9),
main="上海2015年12月风向玫瑰图")

## 3:分析多个变量之间的关系
## 3.1:上海天气之间的相关性
shanghai<- chinaweather[chinaweather$city == "上海",]
## 分析各个监测点监测到的数据到的相关性-------------------------------------
## 数据准备
shanghai_day <- shanghai %>%
dplyr::group_by(city,days) %>% ## 按照城市和天分组
summarise(MeanTemperature = mean(temperature), #平均温度
MeanRelative_humidity = mean(relative_humidity), #平均相对湿度
MeanRainfall = mean(rainfall) * 24, #每天平均降雨量
MeanWind_strong = mean(wind_strong)) #平均风力
weacor <- cor(shanghai_day[3:6])
rownames(weacor) <- c("温度","相对湿度","降雨量","风力")
colnames(weacor) <- c("温度","相对湿度","降雨量","风力")
par(family = "STKaiti",mfrow = c(1,1),cex=0.8)
corrplot(weacor,method = "pie",type = "upper",
mar = c(2, 2, 2, 10),title = "天气相关系数")

## 绘制矩阵散点图分析
smdata <- data.frame(shanghai_day[3:6])
names(smdata) <- c("温度","相对湿度","降雨量","风力")
ggscatmat(smdata) + theme_bw(base_family = "STKaiti") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x = "",y = "",title = "天气数据")

## 4:通过地图对天气数据进行可视化
## 获取所有城市的信息,并的到数据可视化地图####-------------------------------
citys <- chinaweather %>%
group_by(city,day) %>%
summarise(MeanTemperature = mean(temperature), #平均温度
MeanRelative_humidity = mean(relative_humidity), #平均相对湿度
MeanRainfall = mean(rainfall) * 24, #平均每天降雨量
MeanWind_strong = mean(wind_strong)) %>%#平均风力
arrange(city)
citys$city <- as.character(citys$city)
## 读取数据
citys_geo <- read_csv("各个城市的经纬度.csv")
## Parsed with column specification:
## cols(
## lon = col_double(),
## lat = col_double(),
## city = col_character()
## )
## 查看两个城市的差集
setdiff(unique(citys$city), citys_geo$city)
## [1] "甘南"
citys <- citys[citys$city != setdiff(unique(citys$city), citys_geo$city),]
## 连接天气数据集和坐标数据集
newcitys <- left_join(citys,citys_geo,by = c("city" = "city"))
## 绘制可交互的地图 ------------------------
## 提取一天的数据
newcitys_day <- newcitys[newcitys$day == "2015-12-15",]
## 定义颜色 -------------------------
tempcolor <- colorNumeric(viridis(6,option = "plasma"),newcitys_day$MeanTemperature)
rehucolor <- colorNumeric(viridis(6,option = "plasma"),newcitys_day$MeanRelative_humidity)
raincolor <- colorNumeric(viridis(6,option = "viridis"),newcitys_day$MeanRainfall)
windcolor <- colorNumeric(viridis(6,option = "viridis"),newcitys_day$MeanWind_strong)
## 绘制地图--------------------------------
## 天气数据的地图可视化,展示各个地点的天气状况
## 该图像,每个参数使用颜色和大小进行编码,使用鼠标点击圆,会显示相应的点和参数
map <- leaflet(data = newcitys_day,width = 850, height = 600) %>%
setView(lng = 103,lat = 35 ,zoom = 4) %>%
addTiles() %>%
addPopups(lng = 103, lat = 45, popup = "2015/12/15")%>%
addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
stroke = FALSE,group = "温度",
fillOpacity = 0.8,radius = ~ (8 + newcitys_day$MeanTemperature/4),
popup = ~paste(newcitys_day$city,round(newcitys_day$MeanTemperature,2),sep = "-温度:"),
color = ~tempcolor(newcitys_day$MeanTemperature)) %>%
addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
stroke = FALSE,group = "相对湿度",
fillOpacity = 0.8,radius = ~ (2 + newcitys_day$MeanRelative_humidity/10),
popup = ~paste(newcitys_day$city,round(newcitys_day$MeanRelative_humidity,2),sep = "-相对湿度:"),
color = ~rehucolor(newcitys_day$MeanRelative_humidity)) %>%
addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
stroke = FALSE,group = "降雨量",
fillOpacity = 0.8,radius = ~ (4.5 + newcitys_day$MeanRainfall),
popup = ~paste(newcitys_day$city,round(newcitys_day$MeanRainfall,2),sep = "-降雨量:"),
color = ~raincolor(newcitys_day$MeanRainfall)) %>%
addCircleMarkers(lng = newcitys_day$lon, lat = newcitys_day$lat,
stroke = FALSE,group = "风力",
fillOpacity = 0.8,radius = ~ (2 + newcitys_day$MeanWind_strong),
popup = ~paste(newcitys_day$city,round(newcitys_day$MeanWind_strong,2),sep = "-风力:"),
color = ~windcolor(newcitys_day$MeanWind_strong)) %>%
addLegend("topleft",pal = tempcolor, values = newcitys_day$MeanTemperature,
title ="温度(℃)",opacity = 0.5) %>%
addLegend("topleft",pal = rehucolor, values = newcitys_day$MeanRelative_humidity,
title = "相对湿度",opacity = 0.5) %>%
addLayersControl(baseGroups = c("温度","相对湿度","降雨量","风力"),
options = layersControlOptions(collapsed = FALSE),
position = "topright") %>%
addLegend("topright",pal = raincolor, values = newcitys_day$MeanRainfall,
title = "降雨量(mm)",opacity = 0.5) %>%
addLegend("topright",pal = windcolor, values = newcitys_day$MeanWind_strong,
title = "风力(m/s)",opacity = 0.5)
map
## =======================================================
## 数据预处理
## 获取每个城市12月份每天的天气数据
citys <- chinaweather %>%
group_by(city,day) %>%
summarise(MeanTemperature = mean(temperature), #平均温度
MeanRelative_humidity = mean(relative_humidity), #平均相对湿度
MeanRainfall = mean(rainfall) * 24, #平均每天降雨量
MeanWind_strong = mean(wind_strong))#平均风力
## 清洗数据,剔除甘南这个地方
citys <- citys[citys$city != "甘南",]
## 查看我们都有哪些地点
unique(citys$city)
## [1] "阿坝" "阿克苏" "阿拉尔" "阿拉善盟" "阿勒泰" "阿里"
## [7] "安康" "安庆" "安顺" "安阳" "鞍山" "巴彦淖尔"
## [13] "巴音郭楞" "巴中" "白城" "白沙" "白山" "白银"
## [19] "百色" "蚌埠" "包头" "宝鸡" "保定" "保山"
## [25] "保亭" "北海" "北京" "本溪" "毕节" "滨州"
## [31] "亳州" "博尔塔拉" "沧州" "昌都" "昌吉" "昌江"
## [37] "常德" "常州" "朝阳" "潮州" "郴州" "成都"
## [43] "承德" "澄迈" "池州" "赤峰" "重庆" "崇左"
## [49] "滁州" "楚雄" "达州" "大理" "大连" "大庆"
## [55] "大同" "大兴安岭" "丹东" "儋州" "德宏" "德阳"
## [61] "德州" "迪庆" "定安" "定西" "东方" "东莞"
## [67] "东营" "鄂尔多斯" "鄂州" "恩施" "防城港" "佛山"
## [73] "福州" "抚顺" "抚州" "阜新" "阜阳" "甘孜"
## [79] "赣州" "固原" "广安" "广元" "广州" "贵港"
## [85] "贵阳" "桂林" "果洛" "哈尔滨" "哈密" "海北"
## [91] "海东" "海口" "海南" "海西" "邯郸" "汉中"
## [97] "杭州" "合肥" "和田" "河池" "河源" "菏泽"
## [103] "贺州" "鹤壁" "鹤岗" "黑河" "衡水" "衡阳"
## [109] "红河" "呼和浩特" "呼伦贝尔" "湖州" "葫芦岛" "怀化"
## [115] "淮安" "淮北" "淮南" "黄冈" "黄南" "黄山"
## [121] "黄石" "惠州" "鸡西" "吉安" "吉林" "济南"
## [127] "济宁" "济源" "佳木斯" "嘉兴" "江门" "焦作"
## [133] "揭阳" "金昌" "金华" "锦州" "晋城" "晋中"
## [139] "荆门" "荆州" "景德镇" "九江" "酒泉" "喀什"
## [145] "开封" "克拉玛依" "克州" "昆明" "拉萨" "来宾"
## [151] "莱芜" "兰州" "廊坊" "乐东" "乐山" "丽江"
## [157] "丽水" "连云港" "凉山" "辽阳" "辽源" "聊城"
## [163] "林芝" "临沧" "临汾" "临高" "临夏" "临沂"
## [169] "陵水" "柳州" "六安" "六盘水" "龙岩" "陇南"
## [175] "娄底" "泸州" "吕梁" "洛阳" "漯河" "马鞍山"
## [181] "茂名" "眉山" "梅州" "绵阳" "牡丹江" "那曲"
## [187] "南昌" "南充" "南京" "南宁" "南平" "南通"
## [193] "南阳" "内江" "宁波" "宁德" "怒江" "攀枝花"
## [199] "盘锦" "平顶山" "平凉" "萍乡" "莆田" "濮阳"
## [205] "普洱" "七台河" "齐齐哈尔" "潜江" "黔东南" "黔南"
## [211] "黔西南" "钦州" "秦皇岛" "青岛" "清远" "庆阳"
## [217] "琼海" "琼中" "曲靖" "衢州" "泉州" "日喀则"
## [223] "日照" "三门峡" "三明" "三亚" "厦门" "山南"
## [229] "汕头" "汕尾" "商洛" "商丘" "上海" "上饶"
## [235] "韶关" "邵阳" "绍兴" "深圳" "神农架" "沈阳"
## [241] "十堰" "石河子" "石家庄" "石嘴山" "双鸭山" "朔州"
## [247] "四平" "松原" "苏州" "宿迁" "宿州" "绥化"
## [253] "随州" "遂宁" "塔城" "台州" "太原" "泰安"
## [259] "泰州" "唐山" "天津" "天门" "天水" "铁岭"
## [265] "通化" "通辽" "铜川" "铜陵" "铜仁" "吐鲁番"
## [271] "屯昌" "万宁" "威海" "潍坊" "渭南" "温州"
## [277] "文昌" "文山" "乌海" "乌兰察布" "乌鲁木齐" "无锡"
## [283] "吴忠" "芜湖" "梧州" "五指山" "武汉" "武威"
## [289] "锡林郭勒" "西安" "西宁" "西沙" "西双版纳" "咸宁"
## [295] "咸阳" "仙桃" "湘潭" "湘西" "襄阳" "孝感"
## [301] "忻州" "新乡" "新余" "信阳" "邢台" "兴安盟"
## [307] "徐州" "许昌" "宣城" "雅安" "烟台" "延安"
## [313] "延边" "盐城" "扬州" "阳江" "阳泉" "杨凌"
## [319] "伊春" "伊犁" "益阳" "宜宾" "宜昌" "宜春"
## [325] "银川" "鹰潭" "营口" "永州" "榆林" "玉林"
## [331] "玉树" "玉溪" "岳阳" "云浮" "运城" "枣庄"
## [337] "湛江" "张家界" "张家口" "张掖" "漳州" "长春"
## [343] "长沙" "长治" "昭通" "肇庆" "镇江" "郑州"
## [349] "中山" "中卫" "舟山" "周口" "株洲" "珠海"
## [355] "驻马店" "资阳" "淄博" "自贡" "遵义"
length(unique(citys$city))
## [1] 359
## 一共右359个地点
## 对数据的特征进行准备工作
## 我们一共有一个月的数据,我们的特征可能需要包括,该月四个天气因素的最大值、
## 最小值、均值、标准差等。
citys_weather <- citys %>%
group_by(city) %>%
summarise(MeanTemperature2 = mean(MeanTemperature), #平均温度
MinTemperature = min(MeanTemperature), #最低温度
MaxTemperature = max(MeanTemperature), #最低温度
SdTemperature = sd(MeanTemperature), #温度标准差
MeanRelative_humidity2 = mean(MeanRelative_humidity),#平均相对湿度
MinRelative_humidity = min(MeanRelative_humidity),#最小相对湿
MaxRelative_humidity = max(MeanRelative_humidity), #最大相对湿
SdRelative_humidity = sd(MeanRelative_humidity), #相对湿标准差
MeanRainfall2 = mean(MeanRainfall), #平均每天降雨量
MaxRainfall = max(MeanRainfall), #最大每天降雨量;因为所有的最小降雨量都为0,所以剔除
SdRainfall = sd(MeanRainfall), #每天降雨量标准差
MeanWind_strong2 = mean(MeanWind_strong),#平均风力
MinWind_strong = min(MeanWind_strong),#平均风力
MaxWind_strong = max(MeanWind_strong),#平均风力
SdWind_strong = sd(MeanWind_strong)#平均风力
)
## 对我们的特征数据进行标准化
citys_cluster <- apply(citys_weather[,2:16],2,scale)
## ==============================================================
## ===============k均值聚类算法======================================
## ==============================================================
## 使用k均值聚类算法进行聚类
## 探索由聚类数目k的变化
## 计算聚类的残差平方和
set.seed(1234)
## 计算组内平方和 组间平房和
tot_withinss <- vector()
betweenss <- vector()
kk = 30
for(ii in 1:kk){
k1 <- kmeans(citys_cluster,ii,iter.max = 100,algorithm = "MacQueen")
tot_withinss[ii] <- k1$tot.withinss
betweenss[ii] <- k1$betweenss
}
kmeanvalue <- data.frame(kk = 1:kk,
tot_withinss = tot_withinss,
betweenss = betweenss)
p1 <- ggplot(kmeanvalue,aes(x = kk,y = tot_withinss))+
theme_bw(base_family = "STKaiti") +
geom_point(colour = "red") +
geom_line() +
labs(x = "kmean 聚类个数",y = "value") +
ggtitle("簇内平方和总和")+
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot(kmeanvalue,aes(x = kk,y = betweenss))+
theme_bw(base_family = "STKaiti") +
geom_point(colour = "red") +
geom_line() +
labs(x = "kmean 聚类个数",y = "value") +
ggtitle("簇间平方和")+
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1,p2,nrow = 2)

## 从上面的k均值聚类图,我们可以发现,我们可以将聚类的个数定为10类,这个类说在5以上都可以
## 因为在10之前,这两个指标变化的比较剧烈,之后变化较为平缓,所以定为10类
## 聚类为k类,查看聚类结果
k = 8 ## 聚类的数目
kcluster <- kmeans(citys_cluster,k,iter.max = 100,algorithm = "MacQueen")
## 输出我们的聚类结果
kcluster
## K-means clustering with 8 clusters of sizes 64, 21, 47, 56, 90, 45, 14, 22
##
## Cluster means:
## MeanTemperature2 MinTemperature MaxTemperature SdTemperature
## 1 0.3576381 0.4393421 0.2715715 -0.41879862
## 2 1.3052462 1.1381350 1.4541089 0.09865897
## 3 0.7623438 0.7372296 0.7624496 -0.38224793
## 4 -1.4734001 -1.6704345 -1.2837192 1.66898621
## 5 -0.3132209 -0.1880822 -0.4338724 -0.50966496
## 6 0.4803326 0.4244698 0.5592737 -0.14687127
## 7 1.7348037 1.5453227 1.8146054 -0.13379968
## 8 -0.9695955 -0.7696567 -1.0630455 0.16299761
## MeanRelative_humidity2 MinRelative_humidity MaxRelative_humidity
## 1 0.45965533 0.9327377 0.02243190
## 2 0.52582301 -0.3582297 0.60376145
## 3 0.84977432 0.1876716 0.63639789
## 4 -0.05294843 0.5710032 -0.22492713
## 5 -0.63514273 -0.9207584 0.03901732
## 6 0.51894233 0.2032209 0.54675735
## 7 0.96531144 0.9444179 0.48195145
## 8 -2.59720112 -1.4758029 -3.01328968
## SdRelative_humidity MeanRainfall2 MaxRainfall SdRainfall
## 1 -0.9393036 -0.3528499 -0.3567487 -0.3704743
## 2 0.2805187 2.1023911 3.0789295 2.8390010
## 3 0.2998509 1.6021797 1.0297305 1.2195072
## 4 -0.6945036 -0.5435679 -0.4762532 -0.5011817
## 5 1.0342239 -0.5917613 -0.5649016 -0.5938365
## 6 0.2815534 -0.2378063 -0.2046940 -0.2082598
## 7 -0.6753684 0.4283716 0.5163096 0.5178990
## 8 -0.7850521 -0.3849028 -0.4876690 -0.4360427
## MeanWind_strong2 MinWind_strong MaxWind_strong SdWind_strong
## 1 -1.03341559 -0.67227417 -1.171789e+00 -1.12930655
## 2 0.52119321 0.22414059 2.820168e-01 0.10182832
## 3 -0.23263230 -0.15949801 -4.548059e-01 -0.46206258
## 4 0.47064732 0.22888869 6.672347e-01 0.71489865
## 5 -0.07989664 -0.30280188 2.467000e-01 0.40195893
## 6 0.08140400 0.07392906 5.037447e-05 -0.02064721
## 7 2.73480694 2.37449968 1.953025e+00 1.60459436
## 8 0.22778315 1.07634545 1.606925e-01 -0.26780269
##
## Clustering vector:
## [1] 8 1 1 8 4 8 1 6 6 5 4 4 4 1 4 1 4 1 3 6 4 1 5 1 6 7 5 4 1 5 5 4 5 8 4
## [36] 7 1 6 5 2 3 1 5 3 6 4 1 3 6 6 1 1 4 4 5 4 4 6 1 1 5 5 7 1 7 2 5 4 5 1
## [71] 7 2 3 4 3 4 5 8 3 5 1 1 2 3 6 3 8 4 4 8 8 7 8 8 5 1 3 6 5 3 2 5 3 5 4
## [106] 4 5 3 1 4 4 6 5 1 6 5 5 6 8 3 6 2 4 3 4 5 5 5 4 6 2 5 2 8 3 5 5 5 5 6
## [141] 3 6 4 5 5 4 8 6 8 3 5 1 5 6 1 5 3 5 1 4 4 5 8 1 5 6 1 5 7 3 6 6 2 1 3
## [176] 1 5 5 5 6 3 1 2 1 4 8 3 1 6 3 3 6 5 1 3 3 1 1 4 5 1 3 2 5 1 4 4 1 1 1
## [211] 6 6 5 5 2 5 7 1 6 3 2 8 5 5 3 7 2 8 2 2 5 5 6 3 2 3 3 2 5 4 1 4 5 5 4
## [246] 5 4 4 6 5 5 4 5 1 4 3 5 5 6 5 5 6 1 4 4 4 5 6 1 1 1 7 4 5 5 3 3 1 5 4
## [281] 4 6 5 6 3 1 6 8 4 1 8 7 1 6 5 6 3 1 5 5 5 5 3 5 5 4 5 5 6 1 4 5 4 6 6
## [316] 2 5 1 4 1 1 1 1 3 5 3 4 3 5 7 8 6 6 3 5 5 7 1 4 8 2 4 3 5 1 3 6 5 2 5
## [351] 7 5 3 3 5 1 5 1 1
##
## Within cluster sum of squares by cluster:
## [1] 274.73868 90.33927 155.94932 412.32102 342.65000 160.86729 171.57558
## [8] 212.03634
## (between_SS / total_SS = 66.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
## 查看每一类有多少个地点
table(kcluster$cluster)
##
## 1 2 3 4 5 6 7 8
## 64 21 47 56 90 45 14 22
## 聚类结果可视化,该图因为聚类个数太多所以可以不分析
clusplot(citys_cluster,kcluster$cluster,main = paste("kmean cluster number=",k,sep = ""))

## 表示聚类效果,聚类中的轮廓信息
si1 <- silhouette(kcluster$cluster,dist(citys_cluster,method = "euclidean"))
par(cex = 0.9)
plot(si1,main = "kmean silhouette",col = "red")

## 从聚类中的轮廓信息轮廓信息图,我们可以发现,我们的聚类效果还是很好的
## 每一类中轮廓图中的直方条形越接近于1的越多,说明这些聚类在一族越好。
## 查看哪些城市都氛围了哪一类
city_clu <- data.frame(city = citys_weather$city,
cluster = kcluster$cluster)
city_clu$city <- as.character(city_clu$city)
## 我们将这些类别在地图上展示出来
## 链接数据
## 连接天气数据集和坐标数据集
newcitys <- left_join(city_clu,citys_geo,by = c("city" = "city"))
## 将聚类数据在地图上显示出来
## 定义颜色 -------------------------
clucolor <- colorFactor(viridis(k,option = "viridis"),newcitys$cluster)
## 绘制地图--------------------------------
## k均值聚类结果展示在地图上,用开方便的分析我们的聚类结果
## 该图像,聚类得懂的类别使用颜色进行编码,使用鼠标点击圆,会显示相应的地点和类别
map <- leaflet(data = newcitys,width = 800, height = 600) %>%
setView(lng = 103,lat = 35 ,zoom = 4) %>%
addTiles() %>%
addCircleMarkers(lng = newcitys$lon, lat = newcitys$lat,
stroke = FALSE,group = "类别",
fillOpacity = 0.8,radius = 8.5,
popup = ~paste(newcitys$city,newcitys$cluster,sep = "-类别为:"),
color = ~clucolor(newcitys$cluster)) %>%
addLegend("topleft",pal = clucolor, values = newcitys$cluster,
title ="类别",opacity = 0.5) %>%
addLayersControl(baseGroups = c("K means 聚类结果"),
options = layersControlOptions(collapsed = FALSE),
position = "topright")
map
## ===================================================
## 太原温度数据可视化
## 读取数据
taiyuan <- read_excel("太原.xlsx",col_names = FALSE)
names(taiyuan) <- c("time","MeanTemp")
## 整理数据
## 定义时间数据,时间从12年1月开始,到17年5月结束
taiyuan$time <- as.Date(as.yearmon(2012 + seq(0,64)/12))
taiyuan$year <- year(taiyuan$time)
taiyuan$month <- month(taiyuan$time)
## 设定月份
mymonths <- c("January","February","March","April","May","June","July",
"August","September","October","November","December")
## 等级
month.name <- sort(unique(taiyuan$month))
taiyuan$month2 <- factor(taiyuan$month, levels = month.name,labels = mymonths)
head(taiyuan)
## # A tibble: 6 x 5
## time MeanTemp year month month2
## <date> <dbl> <dbl> <dbl> <fct>
## 1 2012-01-01 -6. 2012. 1. January
## 2 2012-02-01 -3. 2012. 2. February
## 3 2012-03-01 4. 2012. 3. March
## 4 2012-04-01 14. 2012. 4. April
## 5 2012-05-01 20. 2012. 5. May
## 6 2012-06-01 22. 2012. 6. June
## ==================================================================
## 数据可视化
## 温度的热力图
ggplot(data=taiyuan, aes(x=year,y=month2)) +
theme_bw(base_family = "STKaiti") +
geom_tile(aes(fill = MeanTemp),colour = "white") +
geom_text(aes(label = round(MeanTemp,1))) +
scale_fill_gradientn(colours=rev(brewer.pal(10,'Spectral'))) +
theme(legend.title=element_blank(),
axis.title.y=element_blank(),
axis.title.x=element_blank(),
legend.position="top") +
ggtitle("太原每月平均温度") +
labs(x="年份",y = "月份") +
theme(plot.title = element_text(hjust = 0.5))

## 散点图盒子图
ggplot(data =taiyuan,aes(x=month2,y=MeanTemp,color=MeanTemp)) +
theme_bw(base_family = "STKaiti") +
scale_color_gradientn(colours=rev(brewer.pal(10,'Spectral'))) +
geom_boxplot(colour='black',size=.4,alpha=.5) +
geom_jitter(shape=16,width=.2,size=2) +
theme(legend.title=element_blank(),
legend.position='top',
axis.text.x = element_text(angle=45, hjust=1),
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(breaks = seq(-10,30,5),labels = seq(-10,30,5)) +
ggtitle("太原每月的平均温度") +
xlab('') + ylab('2012~2017温度(℃)')

## 平均气温的曲线图
ggplot(data = taiyuan,aes(x=time,y=MeanTemp)) +
theme_bw(base_family = "STKaiti") +
geom_line() +
geom_point(colour = "red",size = 1) +
scale_x_date(date_breaks = "1 year",date_labels = "%Y") +
xlab('年份') + ylab('温度(℃)') +
ggtitle("太原每月平均温度") +
theme(plot.title = element_text(hjust = 0.5))

## ==============================================================
## ==========使用时间序列分析温度的变化,并进行预测==============
## ==============================================================
## ==============================================================
## ==============================================================
## 定义时间序列数据
tsdata <- ts(data = taiyuan$MeanTemp,start = c(2012,1),frequency = 12)
par(family = "STKaiti")
plot(tsdata,xlab = "时间",ylab = "温度(℃)",main="太原每月平均温度")

## 计算我们的数据个数
length(tsdata)
## [1] 65
## 我们一共有65个数据,我们使用前55个作为模型的训练数据,
## 训练合适的时间序列模型,涌来预测后面的数据
nn = 55
taiyuan[55,]
## # A tibble: 1 x 5
## time MeanTemp year month month2
## <date> <dbl> <dbl> <dbl> <fct>
## 1 2016-07-01 24. 2016. 7. July
tstrain <- ts(taiyuan$MeanTemp[1:nn],start = c(2012,1),frequency = 12)
tstest <- ts(taiyuan$MeanTemp[(nn+1):65],star = c(2016,8),frequency = 12)
## =============================================================================
## 使用新的模型来重新训练季节ARIMA模型
mod2 <- Arima(tstrain,order = c(0,1,1),seasonal = list(order = c(0,1,1)))
mod2
## Series: tstrain
## ARIMA(0,1,1)(0,1,1)[12]
##
## Coefficients:
## ma1 sma1
## -0.8545 -0.9985
## s.e. 0.0951 1.0260
##
## sigma^2 estimated as 1.578: log likelihood=-77.87
## AIC=161.74 AICc=162.37 BIC=166.95
par(family = "STKaiti")
plot(forecast(mod2,h=10),xlab = "时间",ylab = "温度(℃)",shadecols="oldstyle")

## 在ggplot中绘制我们的图像
X <- forecast(mod2,h=10)
X
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Aug 2016 22.8284798 21.027522 24.6294380 20.0741519 25.582808
## Sep 2016 17.8284824 16.008627 19.6483377 15.0452539 20.611711
## Oct 2016 11.8284827 9.989924 13.6670408 9.0166506 14.640315
## Nov 2016 3.5784879 1.721415 5.4355606 0.7383403 6.418636
## Dec 2016 -3.1715056 -5.046910 -1.2961011 -6.0396892 -0.303322
## Jan 2017 -3.9104158 -5.788602 -2.0322301 -6.7828531 -1.037979
## Feb 2017 -0.9104113 -2.806003 0.9851804 -3.8094686 1.988646
## Mar 2017 7.0895835 5.176744 9.0024227 4.1641483 10.015019
## Apr 2017 13.8895869 11.959654 15.8195195 10.9380097 16.841164
## May 2017 19.4895763 17.542700 21.4364522 16.5120865 22.467066
## 构建显得数据集
tempdatatrain <- data.frame(MeanTemp = taiyuan$MeanTemp[1:nn],
time = taiyuan$time[1:nn])
tempdatatrain$class <- "训练数据"
tempdatatest <- data.frame(MeanTemp = taiyuan$MeanTemp[(nn+1):65],
time = taiyuan$time[(nn+1):65])
tempdatatest$class <- "测试数据"
tempdatapre <- data.frame(MeanTemp = as.numeric(X$mean),
time = tempdatatest$time)
tempdatapre$class <- "预测数据"
## 组合数据
plottemp <- rbind(tempdatatrain,tempdatatest,tempdatapre)
## 获得置信区间的数据
X95 <- data.frame(xlower = X$lower[,2],
xupper = X$upper[,2],
time = tempdatatest$time,
MeanTemp = as.numeric(X$mean))
## 绘制图像
titles <- paste("太原:","ARIMA:(",as.character(mod2$call[3]),
str_sub(as.character(mod2$call[4]),14,23),"[12])")
ggplot(plottemp,aes(x = time,y = MeanTemp,
linetype=class,colour = class,shape = class)) +
theme_bw(base_family = "STKaiti")+
geom_line(size = 0.5) +
labs(x = "时间",y = "温度(℃)",title=titles) +
geom_point(size = 1.5) +
scale_x_date(date_breaks = "1 year",date_labels = "%Y") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.position="top",legend.title = element_blank())

##