注意:Species变量在回归时可以被正确地因子化,这里就不专门处理了。
attach(iris)
# 查看数据样式
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# 画出散点图
pairs(iris)
# 可以观察到各个变量之间大多是聚簇的关系,回归分析不会有好的结果。例如以下回归就没有通过F检验:
lm1 = lm(Sepal.Length ~ Sepal.Width, data = iris)
summary(lm1)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.556 -0.633 -0.112 0.558 2.223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.526 0.479 13.63 <2e-16 ***
## Sepal.Width -0.223 0.155 -1.44 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.825 on 148 degrees of freedom
## Multiple R-squared: 0.0138, Adjusted R-squared: 0.00716
## F-statistic: 2.07 on 1 and 148 DF, p-value: 0.152
# 但是我们观察到,花瓣长度和花瓣宽度之间有明显的线性关系:
lm2 = lm(Petal.Length ~ Petal.Width, data = iris)
summary(lm2)
##
## Call:
## lm(formula = Petal.Length ~ Petal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3354 -0.3035 -0.0295 0.2578 1.3945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0836 0.0730 14.8 <2e-16 ***
## Petal.Width 2.2299 0.0514 43.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.478 on 148 degrees of freedom
## Multiple R-squared: 0.927, Adjusted R-squared: 0.927
## F-statistic: 1.88e+03 on 1 and 148 DF, p-value: <2e-16
# 按照题目要求,将Species变量包括进来,可以发现它对花瓣长度有显著影响:
lm3 = lm(Petal.Length ~ ., data = iris)
summary(lm3)
##
## Call:
## lm(formula = Petal.Length ~ ., data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7840 -0.1571 0.0019 0.1473 0.6542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.1110 0.2699 -4.12 6.4e-05 ***
## Sepal.Length 0.6080 0.0502 12.10 < 2e-16 ***
## Sepal.Width -0.1805 0.0804 -2.25 0.026 *
## Petal.Width 0.6022 0.1214 4.96 2.0e-06 ***
## Speciesversicolor 1.4634 0.1735 8.44 3.1e-14 ***
## Speciesvirginica 1.9742 0.2448 8.06 2.6e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.263 on 144 degrees of freedom
## Multiple R-squared: 0.979, Adjusted R-squared: 0.978
## F-statistic: 1.32e+03 on 5 and 144 DF, p-value: <2e-16
# 此外,为了预测鸢尾花的种类,我们进行以下判别分析:
# 由于下面用到的判别函数不能识别字符串因子,所以我们首先创建一个列group取代Species
iris1 = iris
iris1$group[which(Species == "setosa")] = 1
iris1$group[which(Species == "versicolor")] = 2
iris1$group[which(Species == "virginica")] = 3
iris1$group = factor(iris1$group)
# 距离判别
X <- iris1[, 1:4]
G <- iris1$group
source("distinguish.distance.R")
dd = distinguish.distance(X, G)
(prec = rbind(dd, G))
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
## blong 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## G 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# 判错的点
prec[, which(t(prec)[, 1] != t(prec)[, 2])]
## 71 73 84
## blong 3 3 3
## G 2 2 2
# 贝叶斯判别
source("D:\\Rworkspace\\distinguish.bayes.R")
db = distinguish.bayes(X, G)
(prec = rbind(db, G))
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
## blong 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2 3 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## G 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# 判错的点
prec[, which(t(prec)[, 1] != t(prec)[, 2])]
## 69 71 73 78 84
## blong 3 3 3 3 3
## G 2 2 2 2 2
detach(iris)
longley
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234.3 235.6 159.0 107.6 1947 60.32
## 1948 88.5 259.4 232.5 145.6 108.6 1948 61.12
## 1949 88.2 258.1 368.2 161.6 109.8 1949 60.17
## 1950 89.5 284.6 335.1 165.0 110.9 1950 61.19
## 1951 96.2 329.0 209.9 309.9 112.1 1951 63.22
## 1952 98.1 347.0 193.2 359.4 113.3 1952 63.64
## 1953 99.0 365.4 187.0 354.7 115.1 1953 64.99
## 1954 100.0 363.1 357.8 335.0 116.2 1954 63.76
## 1955 101.2 397.5 290.4 304.8 117.4 1955 66.02
## 1956 104.6 419.2 282.2 285.7 118.7 1956 67.86
## 1957 108.4 442.8 293.6 279.8 120.4 1957 68.17
## 1958 110.8 444.5 468.1 263.7 122.0 1958 66.51
## 1959 112.6 482.7 381.3 255.2 123.4 1959 68.66
## 1960 114.2 502.6 393.1 251.4 125.4 1960 69.56
## 1961 115.7 518.2 480.6 257.2 127.9 1961 69.33
## 1962 116.9 554.9 400.7 282.7 130.1 1962 70.55
# 将GNP.deflator对除了Year以外的所有变量做回归:
lm4 = lm(GNP.deflator ~ . - Year, data = longley)
summary(lm4)
##
## Call:
## lm(formula = GNP.deflator ~ . - Year, data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0750 -0.4325 0.0848 0.5048 1.5556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 225.80012 81.87310 2.76 0.0202 *
## GNP 0.22130 0.06090 3.63 0.0046 **
## Unemployed 0.02250 0.00819 2.75 0.0206 *
## Armed.Forces 0.00483 0.00780 0.62 0.5500
## Population -1.70750 0.64475 -2.65 0.0244 *
## Employed -0.27343 0.74614 -0.37 0.7217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.15 on 10 degrees of freedom
## Multiple R-squared: 0.992, Adjusted R-squared: 0.989
## F-statistic: 263 on 5 and 10 DF, p-value: 2.84e-10
# kappa>1000,有严重的多重共线性:
corr = cor(longley[, -c(1, 6)])
kappa(corr, exact = T)
## [1] 2300
# 从与最小特征值相应特征向量得知,GNP、Population和Employed之间有较为明显的线性关系:
eigen(corr)
## $values
## [1] 3.549454 1.185544 0.251157 0.012302 0.001543
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.5279 0.03392 -0.1787 -0.22631 0.798170
## [2,] -0.3520 -0.61504 0.6651 0.23531 0.008957
## [3,] -0.2355 0.77911 0.5766 0.05604 -0.043886
## [4,] -0.5272 -0.06530 -0.1072 -0.63539 -0.550051
## [5,] -0.5139 0.09642 -0.4263 0.69753 -0.241582
# 进行逐步回归,去掉不需要的变量Armed.Forces和Employed:
lm5 = step(lm4)
## Start: AIC=8.89
## GNP.deflator ~ (GNP + Unemployed + Armed.Forces + Population +
## Year + Employed) - Year
##
## Df Sum of Sq RSS AIC
## - Employed 1 0.18 13.3 7.11
## - Armed.Forces 1 0.50 13.7 7.49
## <none> 13.2 8.89
## - Population 1 9.24 22.4 15.40
## - Unemployed 1 9.94 23.1 15.89
## - GNP 1 17.40 30.6 20.36
##
## Step: AIC=7.11
## GNP.deflator ~ GNP + Unemployed + Armed.Forces + Population
##
## Df Sum of Sq RSS AIC
## - Armed.Forces 1 1.3 14.7 6.62
## <none> 13.4 7.11
## - Population 1 9.7 23.0 13.82
## - Unemployed 1 14.5 27.8 16.86
## - GNP 1 35.2 48.6 25.76
##
## Step: AIC=6.62
## GNP.deflator ~ GNP + Unemployed + Population
##
## Df Sum of Sq RSS AIC
## <none> 14.7 6.62
## - Unemployed 1 13.3 28.0 14.95
## - Population 1 13.3 28.0 14.95
## - GNP 1 48.6 63.2 27.99
summary(lm5)
##
## Call:
## lm(formula = GNP.deflator ~ GNP + Unemployed + Population, data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.047 -0.682 0.196 0.696 1.435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 221.12959 48.97251 4.52 0.00071 ***
## GNP 0.22010 0.03493 6.30 3.9e-05 ***
## Unemployed 0.02246 0.00681 3.30 0.00634 **
## Population -1.80501 0.54692 -3.30 0.00634 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.11 on 12 degrees of freedom
## Multiple R-squared: 0.992, Adjusted R-squared: 0.989
## F-statistic: 472 on 3 and 12 DF, p-value: 1.03e-12
# 仍然存在较为严重的多重共线性:
corr2 = cor(longley[, c(2, 3, 5)])
kappa(corr2, exact = T)
## [1] 830.7
# 从主成分分析结果可以看出,前两个主成分已经可以解释94.7%的方差:
pc = princomp(covmat = corr)
summary(pc, loadings = T)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.8840 1.0888 0.50116 0.11091 0.0392837
## Proportion of Variance 0.7099 0.2371 0.05023 0.00246 0.0003086
## Cumulative Proportion 0.7099 0.9470 0.99723 0.99969 1.0000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## GNP -0.528 -0.179 -0.226 0.798
## Unemployed -0.352 -0.615 0.665 0.235
## Armed.Forces -0.235 0.779 0.577
## Population -0.527 -0.107 -0.635 -0.550
## Employed -0.514 -0.426 0.698 -0.242
# 从原来的数据中计算出2个因子变量:
fa = factanal(longley[, -c(1, 6)], factors = 2, scores = "regression")
factors = fa$scores
# 两个因子变量可以根据其载荷解释为“失业因子”和“就业因子”:
fa$loadings
##
## Loadings:
## Factor1 Factor2
## GNP 0.665 0.744
## Unemployed 0.991
## Armed.Forces -0.125 0.694
## Population 0.743 0.666
## Employed 0.568 0.811
##
## Factor1 Factor2
## SS loadings 2.315 2.143
## Proportion Var 0.463 0.429
## Cumulative Var 0.463 0.892
# 此时多重共线性已经没有了:
corr3 = cor(factors)
kappa(corr3, exact = T)
## [1] 1.018
# 对两个因子做回归,结果非常显著:
lm6 = lm(longley$GNP.deflator ~ factors)
summary(lm6)
##
## Call:
## lm(formula = longley$GNP.deflator ~ factors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.154 -0.764 0.043 0.995 2.994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.681 0.434 234.1 < 2e-16 ***
## factorsFactor1 7.313 0.451 16.2 5.3e-10 ***
## factorsFactor2 7.786 0.452 17.2 2.4e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.74 on 13 degrees of freedom
## Multiple R-squared: 0.978, Adjusted R-squared: 0.974
## F-statistic: 283 on 2 and 13 DF, p-value: 1.93e-11
# 原来的代码:
top.1000.sites <- read.csv("E:/A 学习/1课程与书目/2014春/Dataguru-机器学习/02/作业/top_1000_sites.tsv",
sep = "\t", stringsAsFactors = FALSE)
# 由下面的各个图可以看出对PageViews和UniqueVisitors的对数进行回归会取得较好的效果
library(ggplot2)
# PageViews和UniqueVisitors的散点图
# ggplot(top.1000.sites, aes(x = PageViews, y = UniqueVisitors)) + geom_point()
# PageViews的密度分布
# ggplot(top.1000.sites, aes(x = PageViews)) + geom_density()
# PageViews的对数密度分布
# ggplot(top.1000.sites, aes(x = log(PageViews))) + geom_density()
# PageViews和UniqueVisitors的对数散点图及对数回归曲线
ggplot(top.1000.sites, aes(x = log(PageViews), y = log(UniqueVisitors))) +
geom_point() + geom_smooth(method = "lm", se = FALSE)
# 对数回归
lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors), data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ log(UniqueVisitors), data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.182 -0.799 -0.074 0.647 5.155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.8344 0.7520 -3.77 0.00017 ***
## log(UniqueVisitors) 1.3363 0.0457 29.25 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.08 on 998 degrees of freedom
## Multiple R-squared: 0.462, Adjusted R-squared: 0.461
## F-statistic: 856 on 1 and 998 DF, p-value: <2e-16
# 加入HasAdvertising和InEnglish哑变量
lm.fit <- lm(log(PageViews) ~ HasAdvertising + log(UniqueVisitors) +
InEnglish, data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ HasAdvertising + log(UniqueVisitors) +
## InEnglish, data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.428 -0.768 -0.063 0.630 5.413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.9450 1.1478 -1.69 0.09046 .
## HasAdvertisingYes 0.3060 0.0917 3.34 0.00088 ***
## log(UniqueVisitors) 1.2651 0.0705 17.94 < 2e-16 ***
## InEnglishNo 0.8347 0.2086 4.00 6.8e-05 ***
## InEnglishYes -0.1691 0.2042 -0.83 0.40780
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.07 on 995 degrees of freedom
## Multiple R-squared: 0.48, Adjusted R-squared: 0.478
## F-statistic: 229 on 4 and 995 DF, p-value: <2e-16
# 单独对HasAdvertising回归的结果也非常显著,尽管R squared值很低
lm.fit <- lm(log(PageViews) ~ HasAdvertising, data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ HasAdvertising, data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.753 -1.050 -0.145 0.782 8.329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.792 0.115 162.83 <2e-16 ***
## HasAdvertisingYes 0.415 0.126 3.29 0.001 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.47 on 998 degrees of freedom
## Multiple R-squared: 0.0107, Adjusted R-squared: 0.00975
## F-statistic: 10.8 on 1 and 998 DF, p-value: 0.00103
# 观察发现该数据集从第101行开始就缺失InEnglish和TLD数据,导致InEnglish在因子化的时候出现'Yes','No'和“”(空字符串)三个因子水平:
top.1000.sites[95:111, 8:9]
## InEnglish TLD
## 95 No com
## 96 No com.cn
## 97 No com
## 98 Yes com
## 99 Yes com
## 100 No com
## 101
## 102
## 103
## 104
## 105
## 106
## 107
## 108
## 109
## 110
## 111
sum(top.1000.sites$InEnglish == "")
## [1] 900
levels(factor(top.1000.sites$InEnglish))
## [1] "" "No" "Yes"
# 这一错误也反映在上面的回归方程中,InEnglishNo和InEnglishYes被作为两个哑变量出现,违背了InEnglish作为二值变量的属性。因此需要纠正数据集的这一错误:
top.1000.sites$InEnglish = factor(top.1000.sites$InEnglish, levels = c("Yes",
"No"))
# 重做回归:
lm.fit <- lm(log(PageViews) ~ HasAdvertising + log(UniqueVisitors) +
InEnglish, data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ HasAdvertising + log(UniqueVisitors) +
## InEnglish, data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.617 -0.986 -0.030 0.712 4.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.3684 3.1080 -1.41 0.16309
## HasAdvertisingYes 0.0505 0.3974 0.13 0.89920
## log(UniqueVisitors) 1.4010 0.1684 8.32 6e-13 ***
## InEnglishNo 1.0241 0.2665 3.84 0.00022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.28 on 96 degrees of freedom
## (900 observations deleted due to missingness)
## Multiple R-squared: 0.437, Adjusted R-squared: 0.42
## F-statistic: 24.9 on 3 and 96 DF, p-value: 5.52e-12
# 由于回归时900个包含缺失值的观测被删除,所以这个回归的效果不出所料并不比之前的更好,但也足够显著,R squared也没有下降多少,只是HasAdvertising变量不再显著。去掉HasAdvertising变量之后模型变得更显著了:
lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors) + InEnglish,
data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ log(UniqueVisitors) + InEnglish,
## data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.659 -0.983 -0.025 0.721 4.095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.322 3.070 -1.41 0.16247
## log(UniqueVisitors) 1.401 0.168 8.36 4.5e-13 ***
## InEnglishNo 1.020 0.263 3.87 0.00019 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.28 on 97 degrees of freedom
## (900 observations deleted due to missingness)
## Multiple R-squared: 0.437, Adjusted R-squared: 0.425
## F-statistic: 37.6 on 2 and 97 DF, p-value: 7.92e-13
# 再做一次包括HasAdvertising但不包括InEnglish的回归,这个模型是回归效果最好的:
lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors) + HasAdvertising,
data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ log(UniqueVisitors) + HasAdvertising,
## data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.193 -0.765 -0.080 0.639 5.399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.9738 0.7499 -3.97 7.8e-05 ***
## log(UniqueVisitors) 1.3298 0.0455 29.21 < 2e-16 ***
## HasAdvertisingYes 0.2934 0.0927 3.17 0.0016 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.08 on 997 degrees of freedom
## Multiple R-squared: 0.467, Adjusted R-squared: 0.466
## F-statistic: 437 on 2 and 997 DF, p-value: <2e-16
# 为了提高回归效果,我们试着补全TLD变量,方法是从域名Site中用正则表达式提取第一个句点后面的字符串
(tld = gsub("([^\\.]+\\.)(.*)", "\\2", top.1000.sites$Site, perl = T))
## [1] "com" "com" "com" "com" "org" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "co.jp" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "co.uk" "tv" "com.br" "com" "org" "com" "com" "com" "com" "com" "com" "com" "com" "ru" "com" "com.br" "com" "com" "com" "com" "com" "com" "com" "com"
## [67] "ru" "com" "com" "com" "com" "com" "co.jp" "ne.jp" "com" "com" "com" "cn" "com" "com" "com" "com" "com" "com" "com" "com" "jp" "ru" "com.br" "com" "com" "co.jp" "ru" "us" "com" "com.cn" "com" "com" "com" "com" "cn" "com" "com" "cn" "com" "com" "jp" "cn" "com" "de" "com" "com" "com" "com" "net" "com" "com.cn" "com" "com" "gov" "co.uk" "org" "net" "com" "com" "com" "com" "net" "com" "com" "com" "com"
## [133] "com" "com" "com" "com" "com" "com" "cn" "com" "com" "de" "com" "net" "com" "co.uk" "com" "com" "com" "fr" "com" "com.cn" "com" "ru" "com" "co.uk" "com" "com" "net" "net" "com" "com" "com" "com" "org" "com" "com" "ne.jp" "com" "com" "com" "com" "com" "com" "com" "cn" "com" "com" "com.cn" "com" "com" "com" "com" "com" "cn" "com" "cn" "com" "net" "com.br" "com" "com" "com" "com" "com" "fr" "com" "com"
## [199] "com" "ne.jp" "com" "com" "com" "com" "org" "gov" "com" "com.br" "com" "jp" "com.br" "com" "com" "com" "com" "com" "com" "ru" "com" "com" "com" "com" "com" "com" "com" "com" "cn" "net" "de" "com" "com" "com" "cn" "com" "com" "or.jp" "com" "com" "com.br" "com" "com" "com" "com" "com" "com" "com" "com.cn" "jp" "fr" "jp" "com" "com" "cn" "com" "de" "com" "com" "jp" "com" "com" "com.br" "co.uk" "com" "com"
## [265] "com" "com" "jp" "com" "com" "com" "pl" "com" "com" "com" "ne.jp" "com" "com" "pl" "com" "com" "com" "com" "com" "ne.jp" "cn" "com" "co.jp" "com" "co.uk" "net" "com" "com" "com" "com" "ru" "com.cn" "pl" "com" "com" "com" "com" "com" "com" "com" "com" "com" "vn" "com" "com" "com" "com" "com" "com" "jp" "com" "pl" "com" "cn" "com" "com" "com" "com" "com" "com" "com" "com" "com" "net" "com" "com"
## [331] "com" "com" "com" "ne.jp" "it" "com" "com" "gov.br" "com" "com" "cn" "fr" "org" "de" "net" "com" "com" "com" "tv" "com" "com.cn" "de" "co.in" "net" "net" "com" "com" "com" "jp" "tv" "net" "tv" "com" "net" "com" "com" "com" "com" "com" "com" "com" "com" "eu" "com" "co.jp" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "de" "com"
## [397] "com" "com" "com" "com" "com" "com" "net" "com" "com" "it" "com" "ne.jp" "gov.br" "com" "me" "com" "com" "com" "com" "com" "com" "com" "co.jp" "com" "com" "com" "com" "com" "com" "com" "com" "org" "fr" "com" "cc" "ne.jp" "com" "cn" "org" "com" "com" "com" "fr" "com" "com" "com" "com" "jp" "com" "co.jp" "com" "com" "com" "com" "com" "com" "com" "com.br" "com" "com.br" "com" "com" "com.br" "co.jp" "com" "com"
## [463] "com" "com" "com" "com" "ru" "com" "com" "jp" "com" "com.br" "com" "com" "com" "com" "com" "com" "com" "com" "com" "com" "info" "net" "pl" "com" "net" "pl" "com.br" "org" "com" "com" "cn" "fr" "com" "co.kr" "com" "com" "com" "com" "com" "com" "com" "com" "com" "ru" "com" "com" "co.jp" "info" "com" "es" "com" "com" "com" "me" "org" "com" "com" "com" "com" "com" "com" "com" "com" "gov.br" "com" "com"
## [529] "com" "fr" "cn" "net" "com" "com" "com" "com" "com" "com" "co.kr" "com" "com" "com" "ru" "com" "com" "net" "com" "com" "com" "com" "com.br" "it" "com" "com" "com" "com" "com" "ca" "com" "com" "com" "com" "co.jp" "com" "cn" "com" "com" "pt" "info" "com" "com" "com" "com" "com" "org" "ru" "cn" "com.br" "de" "com" "net" "co.jp" "ru" "com" "com" "org" "fm" "com" "com" "com" "gov" "com" "com.au" "com"
## [595] "com" "com" "com" "com" "com" "com" "cn" "gov" "tv" "com" "com" "com" "net" "com" "com" "jp" "com" "ca" "com" "ru" "com" "com" "com" "com" "com.mx" "net" "com" "com" "com" "com" "com.cn" "com" "com" "com" "com" "com" "com" "com" "net" "com" "com" "ru" "co.kr" "com" "com" "com" "ru" "com" "com" "com" "com" "com.br" "cn" "com" "com" "com" "co.kr" "com" "com" "gov" "com" "jp" "com.br" "ne.jp" "com" "cn"
## [661] "co.uk" "cn" "com" "co.uk" "com" "com" "eu" "org" "com" "com.vn" "jp" "net" "com" "fr" "co.jp" "com" "or.jp" "com" "com" "com" "com" "com" "com.tr" "com" "com" "com" "com.br" "fr" "com" "nl" "com" "com" "com.br" "com" "net" "com" "com" "cn" "com" "com" "co.jp" "de" "com" "cn" "com" "gov.uk" "com" "com" "com" "com" "com" "net" "com" "co.uk" "jp" "com" "com" "com" "com" "net" "ru" "com" "com" "de" "com" "gov"
## [727] "com" "net" "com" "it" "com" "com" "org" "com.cn" "com" "com" "com" "com" "com" "com" "ru" "de" "fr" "com" "com.cn" "com" "co.jp" "de" "com" "com" "com.cn" "com" "com" "co.uk" "com" "com" "it" "com" "com" "de" "com" "com" "ru" "com" "com" "com" "com" "cn" "com" "com" "com" "ru" "com" "com" "com.cn" "com" "com" "com" "com" "nl" "com" "com" "it" "com" "com" "com" "com" "com" "com" "nl" "com" "com.tr"
## [793] "com.au" "com" "com.ar" "de" "com.br" "com" "com" "com.br" "com.tw" "com" "com" "fr" "com" "de" "com" "com" "com" "com" "co.uk" "com.br" "fr" "com.cn" "com" "de" "net" "com" "org" "com" "com" "com.br" "com" "com" "cn" "com" "com" "com" "com" "net" "com" "net" "de" "com" "com" "com" "it" "com" "com" "net" "com" "jp" "com" "com" "com" "com" "com" "com" "de" "com" "net" "org" "com.tw" "com" "com" "com" "com" "com"
## [859] "com" "gov" "com" "cn" "com" "com" "com" "com" "com" "com" "net" "com" "com" "com" "com" "com" "com" "co.uk" "eu" "com" "com" "com" "fr" "com" "com" "de" "com" "de" "com" "com" "com" "com.cn" "net" "com.br" "org" "com" "gov" "com" "com.vn" "com" "ru" "co.jp" "de" "de" "com" "es" "net" "com" "com" "com" "com" "com" "com" "com.br" "com" "jp" "jp" "com" "de" "com" "com" "net" "com" "es" "net" "com"
## [925] "com" "org" "com" "com" "com" "co.jp" "com.br" "com" "com" "com" "com" "co.kr" "com" "com" "com" "com" "com" "com" "fr" "com" "ly" "com" "com" "com" "com" "com" "co.jp" "com" "com" "com" "com" "com" "com" "org" "cz" "com" "com.cn" "com" "com" "com" "com" "com" "com" "com" "com" "ru" "com" "com.br" "com" "com" "com" "net" "com" "com" "com" "pl" "com" "com" "jp" "com.br" "co.in" "com" "com" "com" "com" "com"
## [991] "com" "com" "co.jp" "com" "org" "com.br" "com" "com" "com" "com"
# 验证一下提取的前100个字符串与已有的数据是否一致:
(w = which(tld[1:100] != top.1000.sites$TLD[1:100]))
## [1] 67
top.1000.sites[w, ]
## Rank Site Category UniqueVisitors Reach PageViews HasAdvertising InEnglish TLD
## 67 67 yandex.ru Search Engines 46000000 2.5 1.4e+10 Yes No com
# 发现第67行观测的TLD应该是我们提取的ru,而不是原来数据集中的com,由此可见我们提取的TLD甚至纠正了原来数据集中的一个录入错误,可以信赖
top.1000.sites$TLD = tld
levels(factor(top.1000.sites$TLD))
## [1] "ca" "cc" "cn" "co.in" "co.jp" "co.kr" "co.uk" "com" "com.ar" "com.au" "com.br" "com.cn" "com.mx" "com.tr" "com.tw" "com.vn" "cz" "de" "es" "eu" "fm" "fr" "gov" "gov.br" "gov.uk" "info" "it" "jp" "ly" "me" "ne.jp" "net" "nl" "or.jp" "org" "pl" "pt" "ru" "tv" "us" "vn"
top.1000.sites$TLD = factor(top.1000.sites$TLD)
# 将log(PageViews)对除了Rank、Site、Category(哑变量数达231个,太多)、InEnglish(缺失值900个)外的所有变量做回归:
lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors) + Reach + HasAdvertising +
TLD, data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ log(UniqueVisitors) + Reach + HasAdvertising +
## TLD, data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.959 -0.711 -0.029 0.600 5.525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.90778 1.23040 -1.55 0.1213
## log(UniqueVisitors) 1.34375 0.06242 21.53 <2e-16 ***
## Reach 0.01871 0.01483 1.26 0.2076
## HasAdvertisingYes 0.25890 0.09075 2.85 0.0044 **
## TLDcc 1.73637 1.23805 1.40 0.1611
## TLDcn -1.81273 0.74064 -2.45 0.0146 *
## TLDco.in 0.22641 1.01185 0.22 0.8230
## TLDco.jp -1.26218 0.75355 -1.67 0.0943 .
## TLDco.kr -0.33921 0.84651 -0.40 0.6887
## TLDco.uk -0.78748 0.77269 -1.02 0.3084
## TLDcom -1.42605 0.71619 -1.99 0.0467 *
## TLDcom.ar 0.29254 1.23807 0.24 0.8133
## TLDcom.au 0.57413 1.01085 0.57 0.5702
## TLDcom.br -1.06803 0.73923 -1.44 0.1488
## TLDcom.cn -1.17775 0.76108 -1.55 0.1221
## TLDcom.mx -0.24791 1.23802 -0.20 0.8413
## TLDcom.tr 0.79067 1.01087 0.78 0.4343
## TLDcom.tw 0.21858 1.01097 0.22 0.8289
## TLDcom.vn 0.27407 1.01093 0.27 0.7864
## TLDcz 2.07709 1.23823 1.68 0.0938 .
## TLDde -0.69532 0.74524 -0.93 0.3510
## TLDes -0.80356 0.92285 -0.87 0.3841
## TLDeu -1.53757 0.92474 -1.66 0.0967 .
## TLDfm -1.78336 1.23801 -1.44 0.1501
## TLDfr -0.32245 0.76117 -0.42 0.6719
## TLDgov -0.73451 0.80207 -0.92 0.3600
## TLDgov.br -0.23054 0.92489 -0.25 0.8032
## TLDgov.uk -0.05553 1.23803 -0.04 0.9642
## TLDinfo -2.26621 0.92327 -2.45 0.0143 *
## TLDit -0.28611 0.81047 -0.35 0.7242
## TLDjp -1.30015 0.75161 -1.73 0.0840 .
## TLDly -1.59594 1.24148 -1.29 0.1989
## TLDme -0.60968 1.01498 -0.60 0.5482
## TLDne.jp -1.67378 0.79078 -2.12 0.0345 *
## TLDnet -1.38981 0.73253 -1.90 0.0581 .
## TLDnl 0.34379 0.92329 0.37 0.7097
## TLDor.jp -1.47664 1.01198 -1.46 0.1449
## TLDorg -1.55058 0.75076 -2.07 0.0392 *
## TLDpl 0.80552 0.81060 0.99 0.3206
## TLDpt 0.00432 1.23801 0.00 0.9972
## TLDru -0.79077 0.74822 -1.06 0.2908
## TLDtv -1.68671 0.84626 -1.99 0.0465 *
## TLDus -2.67844 1.23997 -2.16 0.0310 *
## TLDvn 0.70697 1.23828 0.57 0.5682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01 on 956 degrees of freedom
## Multiple R-squared: 0.551, Adjusted R-squared: 0.531
## F-statistic: 27.3 on 43 and 956 DF, p-value: <2e-16
# Reach变量和TLD中的大部分哑变量都不显著,于是我们仍然只采用log(UniqueVisitors)和HasAdvertising作为解释变量
lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors) + HasAdvertising,
data = top.1000.sites)
summary(lm.fit)
##
## Call:
## lm(formula = log(PageViews) ~ log(UniqueVisitors) + HasAdvertising,
## data = top.1000.sites)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.193 -0.765 -0.080 0.639 5.399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.9738 0.7499 -3.97 7.8e-05 ***
## log(UniqueVisitors) 1.3298 0.0455 29.21 < 2e-16 ***
## HasAdvertisingYes 0.2934 0.0927 3.17 0.0016 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.08 on 997 degrees of freedom
## Multiple R-squared: 0.467, Adjusted R-squared: 0.466
## F-statistic: 437 on 2 and 997 DF, p-value: <2e-16
# 还可以用爬虫程序获取网站所用语言,从而补全InEnglish数据。由于网站源代码大部分都没有标明语言属性lang,这里就不做了。
# library(RCurl) library(XML)
# urls=paste('www.',top.1000.sites$Site,sep='')
# lang=logical(length(urls)) for(i in 1:length(urls)) { try { page <- getURL(urls[i], ssl.verifypeer = FALSE) page_parsed <- htmlParse(page, asText=T) lang[i]=(xpathSApply(page_parsed, '//@lang')=='en') } }