barplot(apply(data,1,mean))#按行做均值条形图
多元数据直观表示+线性回归分析
1 多元数据直观表示
1.1 各省消费项目均值条形图
省份过多,各省的名称均不能全部显示
将横轴左边旋转90度,各省的名称均可显示
barplot(apply(data,1,mean),las=3)#按行做均值条形图
利用ggplot2包作图较为美观
%>%
data mutate(Average_Consumption = rowMeans(select(., -1), na.rm = TRUE)) %>%
ggplot(aes(x = reorder(row.names(data), -Average_Consumption), y = Average_Consumption)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black", fill = "steelblue") +
labs(title = "各省消费项目均值条形图", x = "", y = "均值") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
1.2 各消费项目均值条形图
按消费项目做均值图条形图
barplot(apply(data,2,mean))#按列做均值图条形图
对不同项目的条形添加不同颜色
barplot(apply(data,2,mean),col=1:8) #按列做彩色均值图条形图
去掉食品列后的数据按列做均值条形图
barplot(apply(data[,2:8],2,mean))
按消费项目做中位数条形图
barplot(apply(data,2,median))
利用ggplot作均值条形图
%>% summarise(across(everything(), mean, na.rm = TRUE)) %>%
data pivot_longer(cols = everything(), names_to = "Consumption_Type", values_to = "Average") %>%
mutate(
Consumption_Type=factor(Consumption_Type,level=c('食品','衣着','设备','医疗','交通','教育','居住','杂项')),
%>%
) ggplot(aes(x = Consumption_Type, y = Average, fill = Consumption_Type)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black") +
theme_minimal() +
labs(title = "各消费项目均值条形图", x = "类别", y = "均值",fill = "消费种类")
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(everything(), mean, na.rm = TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
使各条形的颜色相同
%>% summarise(across(everything(), mean, na.rm = TRUE)) %>%
data pivot_longer(cols = everything(), names_to = "Consumption_Type", values_to = "Average") %>%
mutate(
Consumption_Type=factor(Consumption_Type,level=c('食品','衣着','设备','医疗','交通','教育','居住','杂项')),
%>%
) ggplot(aes(x = Consumption_Type, y = Average)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black", fill = "steelblue") +
theme_minimal() +
labs(title = "各消费项目均值条形图", x = "类别", y = "均值")
1.3 各消费项目箱线图
boxplot函数直接作箱线图,默认每个变量(列)作一个箱线,并将全部变量的箱线在同一个图中展示。
boxplot(data)#按列做箱线图
boxplot(data,horizontal=T,las=1)#箱线图中图形按水平放置
利用ggplot函数作箱线图,需要对数据转化为长结果数据
%>% pivot_longer(cols = 1:8, names_to = "Consumption_Type", values_to = "Value") %>%
data mutate(
Consumption_Type=factor(Consumption_Type,level=c('食品','衣着','设备','医疗','交通','教育','居住','杂项')),
%>%
) ggplot(aes(x = Consumption_Type, y = Value)) +
geom_boxplot() +
labs(title = "各消费项目箱线图", x = "", y = "消费水平") +
theme_minimal() # + coord_flip()
1.4 各消费项目星相图
星相图的每个角表示一个变量,可以得出,,北京,上海,广东,浙江四个地区的消费情况较为突出,其他地区的消费情况大致相同
stars(data) #具有图例的360度星相图
stars(data,key.loc=c(17,7)) #具有图例的360度星相图
stars(data,full=F,key.loc=c(17,7)) #具有图例的180度星相图
stars(data,draw.segments=T,key.loc=c(17,7))#具有图例的360度彩色圆形星相图
stars(data,full=F,draw.segments=T,key.loc=c(17,7))#具有图例的180度彩色圆形星相图
1.5 各消费项目脸谱图
脸谱图较为生动形象,直观地表达样本之间的差异
library(aplpack) #加载aplpack包
::faces(data) aplpack
effect of variables:
modified item Var
"height of face " "食品"
"width of face " "衣着"
"structure of face" "设备"
"height of mouth " "医疗"
"width of mouth " "交通"
"smiling " "教育"
"height of eyes " "居住"
"width of eyes " "杂项"
"height of hair " "食品"
"width of hair " "衣着"
"style of hair " "设备"
"height of nose " "医疗"
"width of nose " "交通"
"width of ear " "教育"
"height of ear " "居住"
::faces(data[,2:8])#去掉第一个变量按每行7个做脸谱图 aplpack
effect of variables:
modified item Var
"height of face " "衣着"
"width of face " "设备"
"structure of face" "医疗"
"height of mouth " "交通"
"width of mouth " "教育"
"smiling " "居住"
"height of eyes " "杂项"
"width of eyes " "衣着"
"height of hair " "设备"
"width of hair " "医疗"
"style of hair " "交通"
"height of nose " "教育"
"width of nose " "居住"
"width of ear " "杂项"
"height of ear " "衣着"
::faces(data[c(1,9,19,28,29,30),])#选择第1,9,19,28,29,30个观测的多元数据做脸谱图 aplpack
effect of variables:
modified item Var
"height of face " "食品"
"width of face " "衣着"
"structure of face" "设备"
"height of mouth " "医疗"
"width of mouth " "交通"
"smiling " "教育"
"height of eyes " "居住"
"width of eyes " "杂项"
"height of hair " "食品"
"width of hair " "衣着"
"style of hair " "设备"
"height of nose " "医疗"
"width of nose " "交通"
"width of ear " "教育"
"height of ear " "居住"
library("TeachingDemos") #install.packages("TeachingDemos")
faces2(data,ncols=7) #TeachingDemos::faces(data)
1.6 各消费项目雷达图
ggplot2的扩展包ggiraphExtra能作雷达图
c(1,9,19,28,29,30),] %>%
data[mutate(省份=rownames(.)) %>%
ggRadar(aes(group = 省份))
1.7 各消费项目调和曲线图
source("msaR.R")#加自定义函数
msa.andrews(data)#绘制调和曲线图
msa.andrews(data[c(1,9,19,28,29,30),])
library("fmsb")
=data[c(1,9,19,28,29,30),]
rddat=rbind(apply(rddat,2,max),apply(rddat,2,min))
maxmin=rbind(maxmin,rddat)
rddatradarchart(rddat, axistype=2, pcol=topo.colors(6), plty=1, pdensity=seq(5,40,by=5), pangle=seq(0,150,by=30), pfcol=topo.colors(6))
#install.packages("andrews")
library(andrews)
Warning: package 'andrews' was built under R version 4.3.3
See the package vignette with `vignette("andrews")`
andrews(data,clr=5,ymax=6)
#选择第1,9,19,28,29,30个观测的多元数据做调和曲线图
andrews(data[c(1,9,19,28,29,30),],clr=5,ymax=6)
2 线性回归分析
## 一元线性回归2-1
每周加班工作时间(x)与签发新保单数目(y)呈明显正相关
每周加班工作时间(x)与签发新保单数目(y)的相关系数为0.95 。
利用每周加班工作时间(x)对签发新保单数目(y)作回归,回归方程为
\[ \widehat{y} =46.15+251.17\times x\]
随机误差\(\epsilon\)的标准差\(\sigma\)的估计值为127.06 5.x与y的决定系数
R2=summary(model_fit)$r.sq) #显示多元线性回归模型决定系数 (
[1] 0.9005
6.对回归方程做方差分析
anova(model_fit)#模型方差分析
Analysis of Variance Table
Response: y
Df Sum Sq Mean Sq F value Pr(>F)
x 1 1168713 1168713 72.4 2.8e-05 ***
Residuals 8 129147 16143
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
7.对回归方程作残差图并做一些分析
plot(model_fit)#做散点图
8.根据\[ \widehat{y} =46.15+251.17\times x\]方程,带入y=1000,可以计算出需要的加班时间为3.798个小时
2.1 多元线性回归2-2
利用广告预算(x1)和销售代理数目(x2)对年销售额(y)作回归,回归方程为:
term estimate std.error statistic p.value (Intercept) -22.74 30.69 -0.74 0.49 x1 0.15 0.11 1.33 0.24 x2 1.22 1.31 0.93 0.40 \[ \widehat{y} =-22.74+0.15\times x1+1.22\times x2\]
5%显著水平下,广告预算(x1)和销售代理数目(x2)的系数均不显著。
广告预算(x1)与销售额(y)相关系数为0.5797;销售代理数目(x2)与销售额(y)相关系数为0.4816 ;广告预算(x1)和销售代理数目(x2)与年销售额(y)的复相关系数为0.6586 。
2.2 多元线性回归2-3
从回归方程中可知
term estimate std.error statistic p.value (Intercept) -5213.1 12704.5 -0.41 0.70 GPA 8508.8 2721.6 3.13 0.03 年龄 181.6 283.5 0.64 0.55 \[ \widehat{起始工资} =-5213.12+8508.79\times GPA+181.58\times 年龄\]
含义
summary(model_fit3)
Call:
lm(formula = 起始工资 ~ ., data = data4)
Residuals:
1 2 3 4 5 6 7 8
1617 207 1282 -704 -2215 -770 -311 893
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5213 12704 -0.41 0.699
GPA 8509 2722 3.13 0.026 *
年龄 182 284 0.64 0.550
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1500 on 5 degrees of freedom
Multiple R-squared: 0.668, Adjusted R-squared: 0.535
F-statistic: 5.02 on 2 and 5 DF, p-value: 0.0637
- 当GPA=3,年龄=24,起始工资的预测值为2.4671^{4} 。
2.3 线性模型选择2-4
货运总量(y)、工业总产值(x1)、农业总产值(x2)、居民非商品支出(x3)的相关系数矩阵为:
y x1 x2 x3 y 1.000 0.556 0.731 0.724 x1 0.556 1.000 0.113 0.398 x2 0.731 0.113 1.000 0.547 x3 0.724 0.398 0.547 1.000
散点图矩阵为:
回归方程为:
\[ \widehat{y} =-348.28+3.75\times x1+7.1\times x2+12.45\times x3\]
回归模型的R方为:0.8055 ,说明 拟合优度好
回归模型F检验说明 ,t检验说明p值小于0.05,回归显著
Call: lm(formula = y ~ ., data = data5) Residuals: Min 1Q Median 3Q Max -25.20 -17.03 2.63 11.68 33.23 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -348.28 176.46 -1.97 0.096 . x1 3.75 1.93 1.94 0.100 x2 7.10 2.88 2.47 0.049 * x3 12.45 10.57 1.18 0.284 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 23.4 on 6 degrees of freedom Multiple R-squared: 0.806, Adjusted R-squared: 0.708 F-statistic: 8.28 on 3 and 6 DF, p-value: 0.0149
剔除不显著的 后,回归模型为
$$ \widehat{y} =-348.28+3.75\times x1+7.1\times x2+12.45\times x3$$
逐步回归的选择的模型为y=-348.28+3.75x1+7.10x2+12.45x3
Start: AIC=65.98 y ~ x1 + x2 + x3 Df Sum of Sq RSS AIC <none> 3297 66.0 - x3 1 762 4059 66.1 - x1 1 2072 5369 68.9 - x2 1 3340 6637 71.0
Call: lm(formula = y ~ x1 + x2 + x3, data = data5) Coefficients: (Intercept) x1 x2 x3 -348.28 3.75 7.10 12.45