barplot(apply(data,1,mean))#按行做均值条形图多元数据直观表示+线性回归分析
1 多元数据直观表示
1.1 各省消费项目均值条形图
省份过多,各省的名称均不能全部显示
将横轴左边旋转90度,各省的名称均可显示
barplot(apply(data,1,mean),las=3)#按行做均值条形图利用ggplot2包作图较为美观
data %>%
mutate(Average_Consumption = rowMeans(select(., -1), na.rm = TRUE)) %>%
ggplot(aes(x = reorder(row.names(data), -Average_Consumption), y = Average_Consumption)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black", fill = "steelblue") +
labs(title = "各省消费项目均值条形图", x = "", y = "均值") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) 1.2 各消费项目均值条形图
按消费项目做均值图条形图
barplot(apply(data,2,mean))#按列做均值图条形图对不同项目的条形添加不同颜色
barplot(apply(data,2,mean),col=1:8) #按列做彩色均值图条形图去掉食品列后的数据按列做均值条形图
barplot(apply(data[,2:8],2,mean))按消费项目做中位数条形图
barplot(apply(data,2,median))利用ggplot作均值条形图
data %>% summarise(across(everything(), mean, na.rm = TRUE)) %>%
pivot_longer(cols = everything(), names_to = "Consumption_Type", values_to = "Average") %>%
mutate(
Consumption_Type=factor(Consumption_Type,level=c('食品','衣着','设备','医疗','交通','教育','居住','杂项')),
) %>%
ggplot(aes(x = Consumption_Type, y = Average, fill = Consumption_Type)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black") +
theme_minimal() +
labs(title = "各消费项目均值条形图", x = "类别", y = "均值",fill = "消费种类")Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(everything(), mean, na.rm = TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
使各条形的颜色相同
data %>% summarise(across(everything(), mean, na.rm = TRUE)) %>%
pivot_longer(cols = everything(), names_to = "Consumption_Type", values_to = "Average") %>%
mutate(
Consumption_Type=factor(Consumption_Type,level=c('食品','衣着','设备','医疗','交通','教育','居住','杂项')),
) %>%
ggplot(aes(x = Consumption_Type, y = Average)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black", fill = "steelblue") +
theme_minimal() +
labs(title = "各消费项目均值条形图", x = "类别", y = "均值")1.3 各消费项目箱线图
boxplot函数直接作箱线图,默认每个变量(列)作一个箱线,并将全部变量的箱线在同一个图中展示。
boxplot(data)#按列做箱线图boxplot(data,horizontal=T,las=1)#箱线图中图形按水平放置利用ggplot函数作箱线图,需要对数据转化为长结果数据
data %>% pivot_longer(cols = 1:8, names_to = "Consumption_Type", values_to = "Value") %>%
mutate(
Consumption_Type=factor(Consumption_Type,level=c('食品','衣着','设备','医疗','交通','教育','居住','杂项')),
) %>%
ggplot(aes(x = Consumption_Type, y = Value)) +
geom_boxplot() +
labs(title = "各消费项目箱线图", x = "", y = "消费水平") +
theme_minimal() # + coord_flip() 1.4 各消费项目星相图
full=F指定星象图为360度星象图
stars(data) #具有图例的360度星相图 stars(data,key.loc=c(17,7)) #具有图例的360度星相图 具有图例的360度彩色圆形星相图
stars(data,draw.segments=T,key.loc=c(17,7))full=F指定星象图为180度星象图,具有图例的180度星相图
具有图例的180度彩色圆形星相图
stars(data,full=F,key.loc=c(17,7)) stars(data,full=F,draw.segments=T,key.loc=c(17,7))星相图的每一个角表示一个变量,北京、上海、广东、浙江四个地区的消费情况比较突出,其他地区的消费情况大致相同。
1.5 各消费项目脸谱图
library(aplpack) #加载aplpack包
faces(d3.1) faces(d3.1[,2:8],ncol.plot=7)#去掉第一个变量按每行7个做脸谱图
faces(d3.1[c(1,9,19,28,29,30),])#选择第1,9,19,28,29,30个观测的多元数据做脸谱图
library(“TeachingDemos”) #install.packages(“TeachingDemos”)
faces2(d3.1,ncols=7) #TeachingDemos::faces(d3.1)
d3.2=read.xlsx('C:/Users/Administrator/Desktop/mvstats5.xlsx','d3.1',rowNames=T);d3.2 食品 衣着 设备 医疗 交通 教育 居住 杂项
北京 4934 1512.9 981.1 1294.1 2328.5 2384.0 1246.2 649.7
天津 4249 1024.2 760.6 1164.0 1309.9 1639.8 1417.5 463.6
河北 2790 975.9 546.8 833.5 1010.5 895.1 917.2 266.2
山西 2600 1064.6 477.7 640.2 1028.0 1054.0 991.8 245.1
内蒙古 2825 1396.9 561.7 719.1 1123.8 1245.1 941.8 468.2
辽宁 3560 1017.6 439.3 879.1 1033.4 1052.9 1047.0 400.2
吉林 2843 1127.1 407.4 854.8 873.9 997.8 1062.5 394.3
黑龙江 2633 1021.5 355.7 729.5 746.0 938.2 784.5 310.7
上海 6125 1330.0 959.5 857.1 3153.7 2653.7 1412.1 763.8
江苏 3929 990.0 707.3 689.4 1303.0 1699.3 1020.1 377.4
浙江 4893 1406.2 666.0 859.1 2473.4 2158.3 1168.1 467.5
安徽 3384 906.5 465.7 554.4 891.4 1170.0 850.2 309.3
福建 4296 940.7 645.4 502.4 1606.9 1426.3 1261.2 376.0
江西 3193 915.1 587.4 385.9 733.0 973.4 728.8 294.6
山东 3181 1238.3 661.0 708.6 1333.6 1191.2 1027.6 325.6
河南 2707 1053.1 549.1 626.5 858.3 936.5 795.4 300.2
湖北 3456 1046.6 550.2 525.3 903.0 1120.3 857.0 242.8
湖南 3244 1017.6 603.2 668.5 986.9 1285.2 869.6 315.8
广东 5057 814.6 853.2 752.5 2966.1 1994.9 1444.9 454.1
广西 3398 656.7 491.0 542.1 932.9 1050.0 803.0 277.4
海南 3547 452.9 520.0 503.8 1401.9 837.8 819.0 210.8
重庆 3674 1171.2 706.8 749.5 1118.8 1237.3 968.5 264.0
四川 3580 949.7 562.0 511.8 1074.9 1031.8 690.3 291.3
贵州 3122 910.3 463.6 354.5 895.0 1036.0 718.6 258.2
云南 3562 859.6 280.6 631.7 1034.7 705.5 673.1 174.2
西藏 3837 880.1 271.3 272.8 866.3 441.0 628.4 335.7
陕西 3064 910.3 513.1 678.4 866.8 1230.7 831.3 332.8
甘肃 2824 939.9 505.2 564.2 861.5 1058.7 768.3 353.6
青海 2803 898.5 484.7 613.2 785.3 953.9 641.9 331.4
宁夏 2761 994.5 480.8 646.0 859.0 863.4 910.7 302.2
新疆 2761 1183.7 475.2 598.8 890.3 896.8 737.0 331.8
aplpack::faces(d3.2)effect of variables:
modified item Var
"height of face " "食品"
"width of face " "衣着"
"structure of face" "设备"
"height of mouth " "医疗"
"width of mouth " "交通"
"smiling " "教育"
"height of eyes " "居住"
"width of eyes " "杂项"
"height of hair " "食品"
"width of hair " "衣着"
"style of hair " "设备"
"height of nose " "医疗"
"width of nose " "交通"
"width of ear " "教育"
"height of ear " "居住"
去掉第一个变量按每行7个做脸谱图
aplpack::faces(d3.2[,2:8],ncol.plot=7)effect of variables:
modified item Var
"height of face " "衣着"
"width of face " "设备"
"structure of face" "医疗"
"height of mouth " "交通"
"width of mouth " "教育"
"smiling " "居住"
"height of eyes " "杂项"
"width of eyes " "衣着"
"height of hair " "设备"
"width of hair " "医疗"
"style of hair " "交通"
"height of nose " "教育"
"width of nose " "居住"
"width of ear " "杂项"
"height of ear " "衣着"
选择第1,9,19,28,29,30个观测的多元数据做脸谱图
aplpack::faces(d3.2[c(1,9,19,28,29,30),])effect of variables:
modified item Var
"height of face " "食品"
"width of face " "衣着"
"structure of face" "设备"
"height of mouth " "医疗"
"width of mouth " "交通"
"smiling " "教育"
"height of eyes " "居住"
"width of eyes " "杂项"
"height of hair " "食品"
"width of hair " "衣着"
"style of hair " "设备"
"height of nose " "医疗"
"width of nose " "交通"
"width of ear " "教育"
"height of ear " "居住"
黑白脸谱图
TeachingDemos::faces(data)脸谱图的差异能直观地看出样本地区之间各项目的差异特性。
1.6 各消费项目雷达图
ggplot2的扩展包ggiraphExtra能作雷达图
data[c(1,9,19,28,29,30),] %>%
mutate(省份=rownames(.)) %>%
ggRadar(aes(group = 省份)) 1.7 各消费项目调和曲线图
#加自定义函数#绘制调和曲线图
source("C:/Users/Administrator/Desktop/msaR.R")#加自定义函数
msa.andrews(data)选择第1,9,19,28,29,30个观测的多元数据
msa.andrews(data[c(1,9,19,28,29,30),])library("fmsb")
rddat=data[c(1,9,19,28,29,30),]
maxmin=rbind(apply(rddat,2,max),apply(rddat,2,min))
rddat=rbind(maxmin,rddat)
radarchart(rddat, axistype=2, pcol=topo.colors(6), plty=1, pdensity=seq(5,40,by=5), pangle=seq(0,150,by=30), pfcol=topo.colors(6)) library(andrews) Warning: package 'andrews' was built under R version 4.3.3
See the package vignette with `vignette("andrews")`
andrews(data,clr=5,ymax=6)选择第1,9,19,28,29,30个观测的多元数据做调和曲线图
andrews(data[c(1,9,19,28,29,30),],clr=5,ymax=6)2 线性回归分析
2.1 一元线性回归2-1
每周加班工作时间(x)与签发新保单数目(y)呈明显正相关
每周加班工作时间(x)与签发新保单数目(y)的相关系数为0.95 。
利用每周加班工作时间(x)对签发新保单数目(y)作回归,回归方程为
\[ \widehat{y} =46.15+251.17\times x\]
随机误差\(\epsilon\)的标准差\(\sigma\)的估计值为127.06
2.2 多元线性回归2-2
利用广告预算(x1)和销售代理数目(x2)对年销售额(y)作回归,回归方程为:
term estimate std.error statistic p.value (Intercept) -22.74 30.69 -0.74 0.49 x1 0.15 0.11 1.33 0.24 x2 1.22 1.31 0.93 0.40 \[ \widehat{y} =-22.74+0.15\times x1+1.22\times x2\]
5%显著水平下,广告预算(x1)和销售代理数目(x2)的系数均不显著。
广告预算(x1)与销售额(y)相关系数为0.5797;销售代理数目(x2)与销售额(y)相关系数为0.4816 ;广告预算(x1)和销售代理数目(x2)与年销售额(y)的复相关系数为0.6586 。
2.3 多元线性回归2-3
从回归方程中可知
term estimate std.error statistic p.value (Intercept) -5213.1 12704.5 -0.41 0.70 GPA 8508.8 2721.6 3.13 0.03 年龄 181.6 283.5 0.64 0.55 \[ \widehat{起始工资} =-5213.12+8508.79\times GPA+181.58\times 年龄\]
含义:当GPA增加一个单位,起始工资平均增加85808.79个单位,当年龄增加一个单位,起始工资平均增加181.58个单位
当GPA=3,年龄=24,起始工资的预测值为2.4671^{4} 。
2.4 线性模型选择2-4
货运总量(y)、工业总产值(x1)、农业总产值(x2)、居民非商品支出(x3)的相关系数矩阵为:
y x1 x2 x3 y 1.000 0.556 0.731 0.724 x1 0.556 1.000 0.113 0.398 x2 0.731 0.113 1.000 0.547 x3 0.724 0.398 0.547 1.000散点图矩阵为:
回归方程为:
\[ \widehat{y} =-348.28+3.75\times x1+7.1\times x2+12.45\times x3\]
回归模型的R方为:0.8055 ,
说明回归模型的R方为:0.8055 ,说明该模型的拟合效果好
回归模型F检验说明 ,t检验说明
回归模型F检验说明F值为8.28,p值为0.0149小于0.05,说明在0.05显著性水平下显著
t检验说明x2的p值小于0.05,说明x2对y的拟合效果显著,x1和x3的p值大于0.05不显著
Call: lm(formula = y ~ x1 + x2 + x3, data = data) Residuals: Min 1Q Median 3Q Max -25.20 -17.03 2.63 11.68 33.23 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -348.28 176.46 -1.97 0.096 . x1 3.75 1.93 1.94 0.100 x2 7.10 2.88 2.47 0.049 * x3 12.45 10.57 1.18 0.284 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 23.4 on 6 degrees of freedom Multiple R-squared: 0.806, Adjusted R-squared: 0.708 F-statistic: 8.28 on 3 and 6 DF, p-value: 0.0149剔除不显著的X1和X3后,回归模型为
Start: AIC=70.72 y ~ x2 Df Sum of Sq RSS AIC <none> 7903 70.7 - x2 1 9049 16953 76.4逐步回归的选择的模型为
Start: AIC=65.98 y ~ x1 + x2 + x3 Df Sum of Sq RSS AIC <none> 3297 66.0 - x3 1 762 4059 66.1 - x1 1 2072 5369 68.9 - x2 1 3340 6637 71.0Call: lm(formula = y ~ x1 + x2 + x3, data = data) Coefficients: (Intercept) x1 x2 x3 -348.28 3.75 7.10 12.45