Exploring data with graphs

setwd("C:/Users/Administrator/Desktop/BIG DATA/R_Book")

facebook <- read.delim("FacebookNarcissism.dat", header= TRUE)

## ggplot 
ggplot(data= facebook, aes(x=NPQC_R_Total, y= Rating, color=Rating_Type)) +
  geom_smooth()+
  geom_point()+
  theme_classic()+
  theme(plot.background = element_rect(fill="grey97")) -> p1

p1

facebook %>% 
  ggplot(aes(NPQC_R_Total, Rating))+
  geom_point(aes(colour = Rating_Type), position = "jitter")+
  geom_smooth(aes(colour = Rating_Type), method=lm, se=F)+
  labs(x= "Narcissim", y= "Facebook Picture Rating",
       color="Rate Attribute") -> p2

p2

hchart(facebook, "point", hcaes(x=NPQC_R_Total,
                                  y=Rating,
                                  group=Rating_Type))

lecture <- read.delim("Lecturer Data.dat", header = TRUE)

datatable(lecture)

lecture$job <- factor(lecture$job, levels = c(1:2),
                      labels = c("lecturer", "student"))

학생들과 강사들의 평균 친구수를 보여주는 오차 막대 그래프

lecture %>% 
  group_by(job) %>% 
  summarise(mean_friend = mean(friends)) -> p3

p3

## # A tibble: 2 x 2
##   job      mean_friend
##   <fct>          <dbl>
## 1 lecturer         2.4
## 2 student         13.2

hchart(p3, "column", hcaes(x=job, mean_friend)) %>%
  hc_tooltip(pointFormat = paste("Number: <b>{point.y}</b> <br>")) %>% 
  hc_xAxis(title= list(text= "Job")) %>% 
  hc_yAxis(plotLines = list(list(
    color ="black", width = 1, dashStyle ="Dash",
    value = mean(lecture$friends))))

lecture %>% 
  ggplot(aes(job, friends))+
  stat_summary(fun.y = mean,
               geom= "bar",
               fill= "#EE6AA7",
               colours = "black")+
  stat_summary(fun.data = mean_cl_normal,
               geom ="errorbar",
               color= "black",
               width = 0.2)+
  theme_economist()+
  labs(x= "Job", y ="Number of Friends") -> p4

p4

평균 알코올 소비량의 오차 막대표

lecture %>% 
  group_by(job) %>% 
  summarise(malcohol = mean(alcohol)) -> alchol_data

datatable(alchol_data)

hchart(alchol_data, "column", hcaes(job, malcohol)) %>% 
  hc_tooltip(pointFormat = paste("Number: <b>{point.y}</b> <br>")) %>% 
  hc_xAxis(title= list(text= "Job")) %>% 
  hc_yAxis(title = list(text = "Alcohol Consumption"),
                        plotLines = list(list(color ="black", width = 1, dashStyle ="Dash", value = mean(lecture$alcohol))))

lecture %>% 
  ggplot(aes(job, alcohol))+
  
  stat_summary(fun.y = "mean", 
               geom="bar", 
               fill ="#EE6AA7",
               color = "black")+
  stat_summary(fun.data = mean_cl_normal,
               geom = "errorbar",
               color ="black",
               width = 0.2)+
  theme_economist()+
  labs(x= "Jobs", y= "Mean Alcohol Consumption") -> p5

p5

평균 수입 오차선 그래프

lecture %>% 
  group_by(job) %>% 
  summarise(mincome = mean(income)) -> income 


hchart(income , "column", hcaes(job, mincome)) %>% 
  hc_yAxis(title = list(text= "Mean income"),
           plotLines  = list(list(color = "black",
                                  dashStyple = "dash",
                                  width = 1,
                                  value = mean(lecture$income)))) %>% 
  hc_tooltip(pointFormat = paste("$: <b>{point.y}</b> <br>"))

lecture %>% 
  ggplot(aes(job, income))+
  stat_summary(fun.y =  "mean", geom = "point")+
  stat_summary(fun.y = mean, geom = "line", aes(group=1), color="Red")+
  stat_summary(fun.y = mean, geom = "bar",  fill ="#EE6AA7", color ="black", alpha= 0.7 )+
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width= 0.2)+
  theme_economist() -> p6

p6

평균 신경과민 정도

lecture %>% 
  group_by(job) %>% 
  summarise(neuro_mean = mean(neurotic)) -> mean_neuro


hchart(mean_neuro, "column", hcaes(job, neuro_mean)) %>% 
  hc_yAxis(title = list(text = "Mean Neuroticism"),
           plotLines  = list(list(color = "black",
                                  dashStyle = "dash",
                                  width =2,
                                  value = mean(lecture$neurotic)))) %>% 
  hc_tooltip(pointFormat = paste("Level of Neuroticism: <b>{point.y}</b> <br>"))

lecture %>% 
  ggplot(aes(job, neurotic)) + 
  stat_summary(fun.y = mean, geom = "bar", fill ="#EE6AA7", color ="black", alpha= 0.7)+
  stat_summary(fun.y = mean, geom = "line", aes(group=1), color ="Red")+
  stat_summary(fun.y = mean, geom = "point")+
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width= 0.2)+
  theme_economist() -> p7

p7

알코올 소비량과 신경과민 정도의 관계

highchart() %>% 
  hc_add_series(lecture, "scatter",hcaes(neurotic, alcohol))%>%
  hc_add_series(lecture, "line",hcaes(neurotic, alcohol, group= job))

ggplot(lecture, aes(neurotic, alcohol, color= job))+
  geom_point(size=2)+
  geom_smooth(method = "lm", aes(fill= job))+
  theme_economist() -> p8


p8

자신과 배우자에 대해 소비한 평균 총알 수의 오차 막대 그림

data  <- read.csv("Infidelity.csv", header = TRUE)
data$X <-NULL

data$id <- row(data[1])

data

##    Gender Partner Self id
## 1    Male      69   33  1
## 2    Male      76   26  2
## 3    Male      70   10  3
## 4    Male      76   51  4
## 5    Male      72   34  5
## 6    Male      65   28  6
## 7    Male      82   27  7
## 8    Male      71    9  8
## 9    Male      71   33  9
## 10   Male      75   11 10
## 11   Male      52   14 11
## 12   Male      34   46 12
## 13 Female      70   97 13
## 14 Female      74   80 14
## 15 Female      64   88 15
## 16 Female      43  100 16
## 17 Female      51  100 17
## 18 Female      93   58 18
## 19 Female      48   95 19
## 20 Female      51   83 20
## 21 Female      74   97 21
## 22 Female      73   89 22
## 23 Female      41   69 23
## 24 Female      84   82 24

# Reshaping 

Bullets = reshape(data, idvar = c("id", "Gender"), varying= c("Partner", "Self"),
                  v.names = "Number_of_Bullets", timevar = "Recipient", time=c(0:1),
                  direction = "long")

Bullets$Recipient <- factor(Bullets$Recipient, labels = c("Partner","Self"))


Bullets %>% 
  ggplot(aes(Recipient, Number_of_Bullets, fill = Gender))+
  stat_summary(fun.y = mean, geom = "bar", position = "dodge")+
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width= 0.2, 
               position = position_dodge(width=0.90))+
  labs(y= "Number of Bullets")

Exploring Assumptions

정규성
분산의 동질성

3.구간 자료

독립성 : 자료의 독립성

정규성

회귀에서 중요함

일반선형모형은 모형의 오차들이 정규분포를 따른다고 가정한다.

정규성 확인은? - Histogram & Density

dlf <- read.delim("DownloadFestival(No Outlier).dat", header = TRUE)

ggplot(dlf, aes(day1))+
  geom_histogram(aes(y=..density..), color="indianred2", fill ="white")+
  stat_function(fun = dnorm, args = list(mean = mean(dlf$day1, na.rm=TRUE),
                                         sd= sd(dlf$day1, na.rm = TRUE)))

  ## stat_function 으로 정규곡선을 그린다. 


ggplot(dlf, aes(sample = dlf$day1))+stat_qq()

hchart(dlf$day1, name= "Day1")

분포도 알아보기

skewness 와 kurtosis 가 0 이여야 정규 분포이다. skew 가 + 이면, 왼쪽으로 기울어지고, - 이면 오른쪽으로 기울어진 분포를 가진다.

브릿존슨 : skew 가 0에 가깝고, kurtosis -.73 으로 조금 - 모멘토 : skew 가 0에 가깝고, kurtosis -1.26

click <- read.delim("ChickFlick.dat", header = TRUE)


by(click$arousal, click$film, describe)

## click$film: Bridget Jones' Diary
##    vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 20 14.8 5.73     15   15.06 5.93   3  24    21 -0.32    -0.73 1.28
## ------------------------------------------------------------ 
## click$film: Memento
##    vars  n  mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 20 25.25 7.13   24.5   25.25 8.9  14  37    23 0.03    -1.26 1.59

분산 동질성 검토

H0 : 분산이 같다

H1 : 분산이 다르다.

leveneTest(click$arousal, click$film, center =median)

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  1.8051 0.1871
##       38

F(1, 38) = 1.81, p > .05 으로, 두 집단의 분산이 다르다.

분산이 동일하지 않다.

정규성 검정을 위해 film 별로 나누어 실행

H0 : 정규 분포와 같다

H1 : 정규 분포와 다르다.

by(data= click$arousal, INDICES = click$film, FUN = shapiro.test)

## click$film: Bridget Jones' Diary
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.97157, p-value = 0.7876
## 
## ------------------------------------------------------------ 
## click$film: Memento
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96039, p-value = 0.5516

시각화로 알아보기 Subset 함수로 해당 부분만 추출한다.

briget <- subset(click, click$film == "Bridget Jones' Diary")
Memento <- subset(click, click$film =="Memento")

click %>% 
  select(film, arousal) %>% 
  group_by(film) -> histo

histo

## # A tibble: 40 x 2
## # Groups:   film [2]
##    film                 arousal
##    <fct>                  <int>
##  1 Bridget Jones' Diary      22
##  2 Bridget Jones' Diary      13
##  3 Bridget Jones' Diary      16
##  4 Bridget Jones' Diary      10
##  5 Bridget Jones' Diary      18
##  6 Bridget Jones' Diary      24
##  7 Bridget Jones' Diary      13
##  8 Bridget Jones' Diary      14
##  9 Bridget Jones' Diary      19
## 10 Bridget Jones' Diary      23
## # ... with 30 more rows

histo %>% 
  ggplot(aes(arousal, color= film, fill= film))+
  stat_density(geom="line", alpha =0.2)+
  geom_histogram(alpha =0.2, binwidth = 3)+
  facet_wrap(~film)+
  labs(y = "Arousal Distributions" , x= "Density",
       title = "Distributions by Films")+
  theme_economist()

briget %>% 
  ggplot(aes(arousal))+
   geom_histogram(aes(y=..density..), color="indianred2", fill ="white")+
  stat_function(fun=dnorm, args = list(mean= mean(briget$arousal, na.rm = TRUE), sd = sd(briget$arousal, na.rm=TRUE)))

Memento %>% 
  ggplot(aes(arousal))+
  geom_histogram(aes(y=..density..), fill="white", color="black")+
  stat_function(fun= dnorm, args = list(mean = mean(Memento$arousal, na.rm=TRUE),
                                        sd = sd(Memento$arousal, na.rm = TRUE)),
                color="red")

이상치 처리하기

제거
자료 변환 : log, 제곱근, 역수, 뒤집기
점수 변경:

rexam <- read.table("rexam.dat", header = TRUE)

#checking histo 

hchart(rexam$numeracy, name = "Numeracy Density")

ggplot(rexam, aes(numeracy))+
  geom_histogram(aes(y=..density..), color="black", fill="white", binwidth = 1.5)+
  stat_function(fun = dnorm, 
                args = list(mean= mean(rexam$numeracy, na.rm= TRUE),
                            sd = sd(rexam$numeracy, na.rm= TRUE)), col="red")

*1. 로그 변환

rexam$log_numeracy <- log(rexam$numeracy)


ggplot(rexam, aes(log_numeracy))+
  geom_histogram(aes(y=..density..), color="black", fill="white", binwidth = 0.7)+
  stat_function(fun = dnorm, 
                args = list(mean= mean(rexam$log_numeracy, na.rm= TRUE),
                            sd = sd(rexam$log_numeracy, na.rm= TRUE)), col="red")

shapiro.test(rexam$log_numeracy)

## 
##  Shapiro-Wilk normality test
## 
## data:  rexam$log_numeracy
## W = 0.95911, p-value = 0.003474

*2 제곱근

rexam$sqrtnemeracy <- sqrt(rexam$numeracy)


ggplot(rexam, aes(sqrtnemeracy))+
  geom_histogram(aes(y=..density..), color="black", fill="white", binwidth = 0.3)+
  stat_function(fun = dnorm, 
                args = list(mean= mean(rexam$sqrtnemeracy, na.rm= TRUE),
                            sd = sd(rexam$sqrtnemeracy, na.rm= TRUE)), col="red")

shapiro.test(rexam$sqrtnemeracy)

## 
##  Shapiro-Wilk normality test
## 
## data:  rexam$sqrtnemeracy
## W = 0.96953, p-value = 0.02036

*3 역수 변환

rexam$recnumeracy <- 1/(rexam$numeracy)

ggplot(rexam, aes(recnumeracy)) +
 geom_histogram(aes(y=..density..),
 colour="black", fill="white", binwidth = .1) +
 labs(x="Reciprocal of Numeracy Scores", y = "Density") +
 stat_function(fun=dnorm,
               args=list (mean=mean (rexam$recnumeracy, na.rm=TRUE),
                           sd=sd(rexam$recnumeracy, na.rm = TRUE)))

shapiro.test(rexam$recnumeracy)

## 
##  Shapiro-Wilk normality test
## 
## data:  rexam$recnumeracy
## W = 0.76329, p-value = 0.00000000002135

Statistics Chapter - 1

DOEUN

2020 3 18

Exploring data with graphs

Exploring Assumptions

정규성