Basic Analysis


패키지 불러오기
# echo=TRUE : 코드를 평가하고 실행결과를 포함한다
# eval=TRUE : 실행결과와 함게 코드를 출력한다
# message=FALSE : 메시지를 출력한다
# warning=TRUE : 경고메시지를 출력한다 
# error=FALSE : 오류메시지를 출력한다 
# tidy=FALSE : 깔끔한 방식으로 코드 형태를 변형한다 

library(tidyverse)
library(jmv)
library(gridExtra)
library(corrplot)
library(RColorBrewer)
library(ggplot2)
library(gapminder)

상관관계
data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris_1 <- iris[-5]
cor(iris_1)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
corrplot(cor(iris_1), method=c("square"),type=c("full"),title="updragon")

***

파일 불러오기
getwd()
## [1] "C:/Users/user/Documents"
tyre <- read.csv("tyre.csv", header = T)

분산분석
head(tyre)
##   Brands Mileage
## 1 Apollo  32.998
## 2 Apollo  36.435
## 3 Apollo  32.777
## 4 Apollo  37.637
## 5 Apollo  36.304
## 6 Apollo  35.915
summary(tyre)
##          Brands      Mileage     
##  Apollo     :15   Min.   :27.88  
##  Bridgestone:15   1st Qu.:32.69  
##  CEAT       :15   Median :34.84  
##  Falken     :15   Mean   :34.74  
##                   3rd Qu.:36.77  
##                   Max.   :41.05
str(tyre)
## 'data.frame':    60 obs. of  2 variables:
##  $ Brands : Factor w/ 4 levels "Apollo","Bridgestone",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Mileage: num  33 36.4 32.8 37.6 36.3 ...
ggplot(tyre, aes(Brands, Mileage)) + geom_boxplot(aes(col=Brands)) 

boxplot.stats(tyre$Mileage[tyre$Brands=="CEAT"])     # boxplot explain
## $stats
## [1] 30.42748 33.11079 34.78336 36.12533 36.97277
## 
## $n
## [1] 15
## 
## $conf
## [1] 33.55356 36.01316
## 
## $out
## [1] 41.05
model1<- aov(Mileage~Brands, data=tyre)
summary(model1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Brands       3  256.3   85.43   17.94 2.78e-08 ***
## Residuals   56  266.6    4.76                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(model1, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Mileage ~ Brands, data = tyre)
## 
## $Brands
##                           diff        lwr       upr     p adj
## Bridgestone-Apollo -3.01900000 -5.1288190 -0.909181 0.0020527
## CEAT-Apollo        -0.03792661 -2.1477456  2.071892 0.9999608
## Falken-Apollo       2.82553333  0.7157143  4.935352 0.0043198
## CEAT-Bridgestone    2.98107339  0.8712544  5.090892 0.0023806
## Falken-Bridgestone  5.84453333  3.7347143  7.954352 0.0000000
## Falken-CEAT         2.86345994  0.7536409  4.973279 0.0037424
plot(TukeyHSD(model1, conf.level = 0.95), las=1,col = "red")          # cf) las=2

***

편리한 패키지
#library(jmv)
data("ToothGrowth")   
str(ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
summary(ToothGrowth)   
##       len        supp         dose      
##  Min.   : 4.20   OJ:30   Min.   :0.500  
##  1st Qu.:13.07   VC:30   1st Qu.:0.500  
##  Median :19.25           Median :1.000  
##  Mean   :18.81           Mean   :1.167  
##  3rd Qu.:25.27           3rd Qu.:2.000  
##  Max.   :33.90           Max.   :2.000
coplot(len ~ dose | supp, data = ToothGrowth, panel = panel.smooth,
       xlab = "ToothGrowth data: length vs dose, given type of supplement")

ttestOneS(ToothGrowth, vars = c('len', 'dose'))
## 
##  ONE SAMPLE T-TEST
## 
##  One Sample T-Test                                      
##  ────────────────────────────────────────────────────── 
##                           statistic    df      p        
##  ────────────────────────────────────────────────────── 
##    len     Student's t         19.1    59.0    < .001   
##    dose    Student's t         14.4    59.0    < .001   
##  ──────────────────────────────────────────────────────
ttestIS(data = ToothGrowth, vars = 'len', group = 'supp')
## 
##  INDEPENDENT SAMPLES T-TEST
## 
##  Independent Samples T-Test                           
##  ──────────────────────────────────────────────────── 
##                          statistic    df      p       
##  ──────────────────────────────────────────────────── 
##    len    Student's t         1.92    58.0    0.060   
##  ────────────────────────────────────────────────────
data('bugs', package = 'jmv')
ttestPS(bugs, pairs = list(list(i1 = 'LDLF', i2 = 'LDHF')))
## 
##  PAIRED SAMPLES T-TEST
## 
##  Paired Samples T-Test                                          
##  ────────────────────────────────────────────────────────────── 
##                                   statistic    df      p        
##  ────────────────────────────────────────────────────────────── 
##    LDLF    LDHF    Student's t        -6.65    90.0    < .001   
##  ──────────────────────────────────────────────────────────────
anova(ToothGrowth, dep="len", factors = c("dose", "supp"))        
## 
##  ANOVA
## 
##  ANOVA                                                                   
##  ─────────────────────────────────────────────────────────────────────── 
##                 Sum of Squares    df    Mean Square    F        p        
##  ─────────────────────────────────────────────────────────────────────── 
##    dose                   2426     2         1213.2    92.00    < .001   
##    supp                    205     1          205.4    15.57    < .001   
##    dose:supp               108     2           54.2     4.11     0.022   
##    Residuals               712    54           13.2                      
##  ───────────────────────────────────────────────────────────────────────

신뢰도분석_알파
data('iris')
a <- reliability(iris, vars = c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width'),omegaScale = TRUE)
a$scale
## 
##  Scale Reliability Statistics              
##  ───────────────────────────────────────── 
##             Cronbach's α    McDonald's ω   
##  ───────────────────────────────────────── 
##    scale           0.708           0.835   
##  ───────────────────────────────────────── 
##    Note. item 'Sepal.Width' correlates
##    negatively with the total scale and
##    probably should be reversed
a$items
## 
##  Item Reliability Statistics 
##  ─────────────────────────── 
##                   
##  ─────────────────────────── 
##    Sepal.Length   
##    Sepal.Width    
##    Petal.Length   
##    Petal.Width    
##  ───────────────────────────

탐색적 요인분석
efa(iris, vars = c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width'))
## The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
## 
##  EXPLORATORY FACTOR ANALYSIS
## 
##  Factor Loadings                                       
##  ───────────────────────────────────────────────────── 
##                    1        2        3    Uniqueness   
##  ───────────────────────────────────────────────────── 
##    Sepal.Length    0.996                     0.00547   
##    Sepal.Width              0.773            0.40932   
##    Petal.Length    0.904                     0.00500   
##    Petal.Width     0.933                     0.01502   
##  ───────────────────────────────────────────────────── 
##    Note. 'oblimin' rotation was used

회귀분석
data('Prestige', package='carData')
str(Prestige)
## 'data.frame':    102 obs. of  6 variables:
##  $ education: num  13.1 12.3 12.8 11.4 14.6 ...
##  $ income   : int  12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
##  $ women    : num  11.16 4.02 15.7 9.11 11.68 ...
##  $ prestige : num  68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
##  $ census   : int  1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
##  $ type     : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...
linReg(data = Prestige, dep = 'income',
       covs = c('education', 'prestige', 'women'),
       blocks = list(list('education', 'prestige', 'women')))
## 
##  LINEAR REGRESSION
## 
##  Model Fit Measures          
##  ─────────────────────────── 
##    Model    R        R²      
##  ─────────────────────────── 
##        1    0.802    0.643   
##  ─────────────────────────── 
## 
## 
##  MODEL SPECIFIC RESULTS
## 
##  MODEL 1
## 
##  Model Coefficients                                       
##  ──────────────────────────────────────────────────────── 
##    Predictor    Estimate    SE         t         p        
##  ──────────────────────────────────────────────────────── 
##    Intercept      -253.8    1086.16    -0.234     0.816   
##    education       177.2     187.63     0.944     0.347   
##    prestige        141.4      29.91     4.729    < .001   
##    women           -50.9       8.56    -5.948    < .001   
##  ────────────────────────────────────────────────────────
# summary(lm(income~education+prestige+women, data = Prestige))

시각화 예제 1
library(gapminder)
data(gapminder)
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, ...
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
hist(gapminder$lifeExp)

plot(density(gapminder$lifeExp))

gapminder %>% filter(year==2007) %>% ggplot(aes(gdpPercap,lifeExp, size=pop, col=continent))+geom_point() + scale_x_log10()

gapminder %>% ggplot(aes(year, lifeExp, group=country, col=continent))+geom_line(cex=.7)+facet_wrap(~continent)

***

시각화 예제 2
# Read data
url <- "http://www.mgs.md.gov/ReservoirDataPoints/PrettyBoy1998.dat"
prettyboy <- read.csv(url, skip = 2, header = FALSE)
names(prettyboy) <- read.csv(url, nrows = 1, header = FALSE, stringsAsFactors = FALSE)

head(prettyboy)
##    Easting Northing Depth
## 1 352606.8  4386507   9.7
## 2 352606.8  4386507   8.8
## 3 352606.5  4386507  11.4
## 4 352605.9  4386509  15.8
## 5 352605.7  4386510  14.1
## 6 352658.8  4386512   7.9
glimpse(prettyboy)
## Observations: 54,680
## Variables: 3
## $ Easting  <dbl> 352606.8, 352606.8, 352606.5, 352605.9, 352605.7, 352...
## $ Northing <int> 4386507, 4386507, 4386507, 4386509, 4386510, 4386512,...
## $ Depth    <dbl> 9.7, 8.8, 11.4, 15.8, 14.1, 7.9, 7.8, 19.2, 8.4, 8.7,...
boxplot(prettyboy$Depth)

# Remove extremes, duplicates and Anomaly
ext <- c(which(prettyboy$Easting == min(prettyboy$Easting)), 
         which(prettyboy$Easting == max(prettyboy$Easting)),
         which(duplicated(prettyboy)))
prettyboy <- prettyboy[-ext, ]

# Visualise reservoir
bathymetry_colours <- c(rev(brewer.pal(3, "Greens"))[-2:-3], 
                        brewer.pal(9, "Blues")[-1:-3])
ggplot(prettyboy, aes(x = Easting, y = Northing, colour = Depth)) + 
  geom_point(size = .1) + coord_equal() + 
  scale_colour_gradientn(colors = bathymetry_colours) 

***