# Load the mpg1 data set from the working directory, keeping character
# columns as plain strings. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned, which would silently flip this option.
mpg1 <- read.csv("mpg1.csv", stringsAsFactors = FALSE)

# Welch two-sample t-test comparing mean city mileage (cty) between the two
# transmission groups (trans).
t.test(cty ~ trans, data = mpg1)
##
## Welch Two Sample t-test
##
## data: cty by trans
## t = -4.5375, df = 132.32, p-value = 1.263e-05
## alternative hypothesis: true difference in means between group auto and group manual is not equal to 0
## 95 percent confidence interval:
## -3.887311 -1.527033
## sample estimates:
## mean in group auto mean in group manual
## 15.96815 18.67532
-> cty 평균거리는 자동식이 15.97마일, 수동식이 18.68마일이다. p-value(1.263e-05)가 유의수준 0.05보다 작아 통계적으로 유의미한 차이가 있으므로, 수동식의 평균이 자동식의 평균보다 약 2.7마일 길다고 할 수 있다.
# Cross-tabulate transmission type (rows) against drive type (columns).
table(mpg1$trans, mpg1$drv)
##
## 4 f r
## auto 75 65 17
## manual 28 41 8
# Row-wise proportions: each transmission type's row sums to 1.
prop.table(
  table(mpg1$trans, mpg1$drv),
  margin = 1
)
##
## 4 f r
## auto 0.4777070 0.4140127 0.1082803
## manual 0.3636364 0.5324675 0.1038961
– 방법 1
# Chi-squared test of independence, passing the two categorical vectors
# directly.
chisq.test(x = mpg1$trans, y = mpg1$drv)
##
## Pearson's Chi-squared test
##
## data: mpg1$trans and mpg1$drv
## X-squared = 3.1368, df = 2, p-value = 0.2084
– 방법 2
# Same chi-squared test, this time passing the contingency table.
chisq.test(x = table(mpg1$trans, mpg1$drv))
##
## Pearson's Chi-squared test
##
## data: table(mpg1$trans, mpg1$drv)
## X-squared = 3.1368, df = 2, p-value = 0.2084
– 방법 3
# summary() on a contingency table also reports the chi-squared test of
# independence.
summary(object = table(mpg1$trans, mpg1$drv))
## Number of cases in table: 234
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 3.1368, df = 2, p-value = 0.2084
-> p-value = 0.2084 > 0.05 이므로 귀무가설을 기각할 수 없다. 즉, trans에 따라 drv에 차이가 있다고 볼 수 없다.
# Pearson correlation test between city and highway mileage.
cor.test(x = mpg1$cty, y = mpg1$hwy, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: mpg1$cty and mpg1$hwy
## t = 49.585, df = 232, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9433129 0.9657663
## sample estimates:
## cor
## 0.9559159
-> 상관계수 r = 0.9559159(p < 0.001)로 cty와 hwy는 유의미하게 매우 높은 양의 상관관계에 있다.
# Simple linear regression of fuel economy (mpg) on displacement (disp) using
# the mtcars data. The model is fitted once and the same object is reused for
# both the coefficient print-out and the full summary, instead of calling
# lm() twice with identical arguments.
fit_mpg_disp <- lm(mpg ~ disp, data = mtcars)
fit_mpg_disp
##
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
##
## Coefficients:
## (Intercept) disp
## 29.59985 -0.04122
summary(fit_mpg_disp)
##
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8922 -2.2022 -0.9631 1.6272 7.2305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.599855 1.229720 24.070 < 2e-16 ***
## disp -0.041215 0.004712 -8.747 9.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.251 on 30 degrees of freedom
## Multiple R-squared: 0.7183, Adjusted R-squared: 0.709
## F-statistic: 76.51 on 1 and 30 DF, p-value: 9.38e-10
-> 회귀모형은 p < 0.001 수준에서 적합하며, 회귀식의 수정된 결정계수는 0.709이다. 배기량이 연비에 미치는 회귀계수는 -0.04이며 p < 0.001 수준에서 유의하다.
# Multiple linear regression of mpg on displacement, horsepower and weight
# using the mtcars data. The model is fitted once and reused for both the
# coefficient print-out and the full summary, instead of refitting the
# identical model a second time.
fit_mpg_multi <- lm(mpg ~ disp + hp + wt, data = mtcars)
fit_mpg_multi
##
## Call:
## lm(formula = mpg ~ disp + hp + wt, data = mtcars)
##
## Coefficients:
## (Intercept) disp hp wt
## 37.105505 -0.000937 -0.031157 -3.800891
summary(fit_mpg_multi)
##
## Call:
## lm(formula = mpg ~ disp + hp + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.891 -1.640 -0.172 1.061 5.861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.105505 2.110815 17.579 < 2e-16 ***
## disp -0.000937 0.010350 -0.091 0.92851
## hp -0.031157 0.011436 -2.724 0.01097 *
## wt -3.800891 1.066191 -3.565 0.00133 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8083
## F-statistic: 44.57 on 3 and 28 DF, p-value: 8.65e-11
-> 회귀모형은 p < 0.001 수준에서 적합하며, 회귀식의 수정된 결정계수는 0.81이다.
-> 3개 독립변수가 연비에 미치는 회귀계수는 hp가 -0.03(p < 0.05), wt가 -3.8(p < 0.01)이었고 disp는 유의하지 않았다(p = 0.93). 즉, wt의 영향력이 가장 크다(단, 비표준화 계수 기준이므로 변수마다 단위가 다르다는 점에 유의해야 한다).
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Re-bind `iris` in the current environment to whatever `iris` currently
# resolves to (presumably the built-in datasets::iris -- confirm), then print
# per-column summary statistics.
iris <- iris
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Keep only the setosa and versicolor rows so that Species has exactly two
# groups to compare.
iris1 <- filter(
  iris,
  Species %in% c("setosa", "versicolor")
)

# Two-sample t-test of Sepal.Length between the two remaining species.
t.test(Sepal.Length ~ Species, data = iris1)
##
## Welch Two Sample t-test
##
## data: Sepal.Length by Species
## t = -10.521, df = 86.538, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group setosa and group versicolor is not equal to 0
## 95 percent confidence interval:
## -1.1057074 -0.7542926
## sample estimates:
## mean in group setosa mean in group versicolor
## 5.006 5.936
library(ggplot2)
# Re-bind `diamonds` in the current environment, then test whether cut and
# color are independent.
diamonds <- diamonds
chisq.test(x = diamonds$cut, y = diamonds$color)
##
## Pearson's Chi-squared test
##
## data: diamonds$cut and diamonds$color
## X-squared = 310.32, df = 24, p-value < 2.2e-16
# Percentage of each color within every cut (rows sum to 100), rounded to
# two decimals.
round(
  prop.table(table(diamonds$cut, diamonds$color), margin = 1) * 100,
  digits = 2
)
##
## D E F G H I J
## Fair 10.12 13.91 19.38 19.50 18.82 10.87 7.39
## Good 13.49 19.02 18.53 17.75 14.31 10.64 6.26
## Very Good 12.52 19.86 17.91 19.03 15.10 9.97 5.61
## Premium 11.62 16.95 16.90 21.20 17.11 10.35 5.86
## Ideal 13.15 18.11 17.75 22.66 14.45 9.71 4.16
# Pearson correlation test between carat and price.
cor.test(x = diamonds$carat, y = diamonds$price, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: diamonds$carat and diamonds$price
## t = 551.41, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9203098 0.9228530
## sample estimates:
## cor
## 0.9215913
# Re-bind `cars` in the current environment, then regress stopping distance
# on speed and print the full model summary.
cars <- cars
summary(
  lm(dist ~ speed, data = cars)
)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Multiple regression of price on carat and depth; print the full summary.
summary(
  lm(price ~ carat + depth, data = diamonds)
)
##
## Call:
## lm(formula = price ~ carat + depth, data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18238.9 -801.6 -19.6 546.3 12683.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4045.333 286.205 14.13 <2e-16 ***
## carat 7765.141 14.009 554.28 <2e-16 ***
## depth -102.165 4.635 -22.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1542 on 53937 degrees of freedom
## Multiple R-squared: 0.8507, Adjusted R-squared: 0.8507
## F-statistic: 1.536e+05 on 2 and 53937 DF, p-value: < 2.2e-16