setwd("C:/Users/Administrator/Desktop/BIG DATA/R_Book")
facebook <- read.delim("FacebookNarcissism.dat", header= TRUE)
## ggplot
ggplot(data= facebook, aes(x=NPQC_R_Total, y= Rating, color=Rating_Type)) +
geom_smooth()+
geom_point()+
theme_classic()+
theme(plot.background = element_rect(fill="grey97")) -> p1
p1facebook %>%
ggplot(aes(NPQC_R_Total, Rating))+
geom_point(aes(colour = Rating_Type), position = "jitter")+
geom_smooth(aes(colour = Rating_Type), method=lm, se=F)+
labs(x= "Narcissim", y= "Facebook Picture Rating",
color="Rate Attribute") -> p2
p2학생들과 강사들의 평균 친구수를 보여주는 오차 막대 그래프
## # A tibble: 2 x 2
## job mean_friend
## <fct> <dbl>
## 1 lecturer 2.4
## 2 student 13.2
hchart(p3, "column", hcaes(x=job, mean_friend)) %>%
hc_tooltip(pointFormat = paste("Number: <b>{point.y}</b> <br>")) %>%
hc_xAxis(title= list(text= "Job")) %>%
hc_yAxis(plotLines = list(list(
color ="black", width = 1, dashStyle ="Dash",
value = mean(lecture$friends))))lecture %>%
ggplot(aes(job, friends))+
stat_summary(fun.y = mean,
geom= "bar",
fill= "#EE6AA7",
colours = "black")+
stat_summary(fun.data = mean_cl_normal,
geom ="errorbar",
color= "black",
width = 0.2)+
theme_economist()+
labs(x= "Job", y ="Number of Friends") -> p4
p4평균 알코올 소비량의 오차 막대표
lecture %>%
group_by(job) %>%
summarise(malcohol = mean(alcohol)) -> alchol_data
datatable(alchol_data)hchart(alchol_data, "column", hcaes(job, malcohol)) %>%
hc_tooltip(pointFormat = paste("Number: <b>{point.y}</b> <br>")) %>%
hc_xAxis(title= list(text= "Job")) %>%
hc_yAxis(title = list(text = "Alcohol Consumption"),
plotLines = list(list(color ="black", width = 1, dashStyle ="Dash", value = mean(lecture$alcohol))))lecture %>%
ggplot(aes(job, alcohol))+
stat_summary(fun.y = "mean",
geom="bar",
fill ="#EE6AA7",
color = "black")+
stat_summary(fun.data = mean_cl_normal,
geom = "errorbar",
color ="black",
width = 0.2)+
theme_economist()+
labs(x= "Jobs", y= "Mean Alcohol Consumption") -> p5
p5평균 수입 오차선 그래프
lecture %>%
group_by(job) %>%
summarise(mincome = mean(income)) -> income
hchart(income , "column", hcaes(job, mincome)) %>%
hc_yAxis(title = list(text= "Mean income"),
plotLines = list(list(color = "black",
dashStyple = "dash",
width = 1,
value = mean(lecture$income)))) %>%
hc_tooltip(pointFormat = paste("$: <b>{point.y}</b> <br>")) lecture %>%
ggplot(aes(job, income))+
stat_summary(fun.y = "mean", geom = "point")+
stat_summary(fun.y = mean, geom = "line", aes(group=1), color="Red")+
stat_summary(fun.y = mean, geom = "bar", fill ="#EE6AA7", color ="black", alpha= 0.7 )+
stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width= 0.2)+
theme_economist() -> p6
p6평균 신경과민 정도
lecture %>%
group_by(job) %>%
summarise(neuro_mean = mean(neurotic)) -> mean_neuro
hchart(mean_neuro, "column", hcaes(job, neuro_mean)) %>%
hc_yAxis(title = list(text = "Mean Neuroticism"),
plotLines = list(list(color = "black",
dashStyle = "dash",
width =2,
value = mean(lecture$neurotic)))) %>%
hc_tooltip(pointFormat = paste("Level of Neuroticism: <b>{point.y}</b> <br>"))lecture %>%
ggplot(aes(job, neurotic)) +
stat_summary(fun.y = mean, geom = "bar", fill ="#EE6AA7", color ="black", alpha= 0.7)+
stat_summary(fun.y = mean, geom = "line", aes(group=1), color ="Red")+
stat_summary(fun.y = mean, geom = "point")+
stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width= 0.2)+
theme_economist() -> p7
p7알코올 소비량과 신경과민 정도의 관계
highchart() %>%
hc_add_series(lecture, "scatter",hcaes(neurotic, alcohol))%>%
hc_add_series(lecture, "line",hcaes(neurotic, alcohol, group= job))ggplot(lecture, aes(neurotic, alcohol, color= job))+
geom_point(size=2)+
geom_smooth(method = "lm", aes(fill= job))+
theme_economist() -> p8
p8자신과 배우자에 대해 소비한 평균 총알 수의 오차 막대 그림
## Gender Partner Self id
## 1 Male 69 33 1
## 2 Male 76 26 2
## 3 Male 70 10 3
## 4 Male 76 51 4
## 5 Male 72 34 5
## 6 Male 65 28 6
## 7 Male 82 27 7
## 8 Male 71 9 8
## 9 Male 71 33 9
## 10 Male 75 11 10
## 11 Male 52 14 11
## 12 Male 34 46 12
## 13 Female 70 97 13
## 14 Female 74 80 14
## 15 Female 64 88 15
## 16 Female 43 100 16
## 17 Female 51 100 17
## 18 Female 93 58 18
## 19 Female 48 95 19
## 20 Female 51 83 20
## 21 Female 74 97 21
## 22 Female 73 89 22
## 23 Female 41 69 23
## 24 Female 84 82 24
# Reshaping
Bullets = reshape(data, idvar = c("id", "Gender"), varying= c("Partner", "Self"),
v.names = "Number_of_Bullets", timevar = "Recipient", time=c(0:1),
direction = "long")
Bullets$Recipient <- factor(Bullets$Recipient, labels = c("Partner","Self"))
Bullets %>%
ggplot(aes(Recipient, Number_of_Bullets, fill = Gender))+
stat_summary(fun.y = mean, geom = "bar", position = "dodge")+
stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width= 0.2,
position = position_dodge(width=0.90))+
labs(y= "Number of Bullets")정규성
분산의 동질성
3.구간 자료
회귀에서 중요함
일반선형모형은 모형의 오차들이 정규분포를 따른다고 가정한다.
정규성 확인은? - Histogram & Density
dlf <- read.delim("DownloadFestival(No Outlier).dat", header = TRUE)
ggplot(dlf, aes(day1))+
geom_histogram(aes(y=..density..), color="indianred2", fill ="white")+
stat_function(fun = dnorm, args = list(mean = mean(dlf$day1, na.rm=TRUE),
sd= sd(dlf$day1, na.rm = TRUE)))분포도 알아보기
skewness 와 kurtosis 가 0 이여야 정규 분포이다. skew 가 + 이면, 왼쪽으로 기울어지고, - 이면 오른쪽으로 기울어진 분포를 가진다.
브릿존슨 : skew 가 0에 가깝고, kurtosis -.73 으로 조금 - 모멘토 : skew 가 0에 가깝고, kurtosis -1.26
## click$film: Bridget Jones' Diary
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 14.8 5.73 15 15.06 5.93 3 24 21 -0.32 -0.73 1.28
## ------------------------------------------------------------
## click$film: Memento
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 25.25 7.13 24.5 25.25 8.9 14 37 23 0.03 -1.26 1.59
분산 동질성 검토
H0 : 분산이 같다
H1 : 분산이 다르다.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.8051 0.1871
## 38
F(1, 38) = 1.81, p > .05 으로, 두 집단의 분산이 다르다.
분산이 동일하지 않다.
정규성 검정을 위해 film 별로 나누어 실행
H0 : 정규 분포와 같다
H1 : 정규 분포와 다르다.
## click$film: Bridget Jones' Diary
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.97157, p-value = 0.7876
##
## ------------------------------------------------------------
## click$film: Memento
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96039, p-value = 0.5516
시각화로 알아보기 Subset 함수로 해당 부분만 추출한다.
briget <- subset(click, click$film == "Bridget Jones' Diary")
Memento <- subset(click, click$film =="Memento")## # A tibble: 40 x 2
## # Groups: film [2]
## film arousal
## <fct> <int>
## 1 Bridget Jones' Diary 22
## 2 Bridget Jones' Diary 13
## 3 Bridget Jones' Diary 16
## 4 Bridget Jones' Diary 10
## 5 Bridget Jones' Diary 18
## 6 Bridget Jones' Diary 24
## 7 Bridget Jones' Diary 13
## 8 Bridget Jones' Diary 14
## 9 Bridget Jones' Diary 19
## 10 Bridget Jones' Diary 23
## # ... with 30 more rows
histo %>%
ggplot(aes(arousal, color= film, fill= film))+
stat_density(geom="line", alpha =0.2)+
geom_histogram(alpha =0.2, binwidth = 3)+
facet_wrap(~film)+
labs(y = "Arousal Distributions" , x= "Density",
title = "Distributions by Films")+
theme_economist()briget %>%
ggplot(aes(arousal))+
geom_histogram(aes(y=..density..), color="indianred2", fill ="white")+
stat_function(fun=dnorm, args = list(mean= mean(briget$arousal, na.rm = TRUE), sd = sd(briget$arousal, na.rm=TRUE)))Memento %>%
ggplot(aes(arousal))+
geom_histogram(aes(y=..density..), fill="white", color="black")+
stat_function(fun= dnorm, args = list(mean = mean(Memento$arousal, na.rm=TRUE),
sd = sd(Memento$arousal, na.rm = TRUE)),
color="red")이상치 처리하기
제거
자료 변환 : log, 제곱근, 역수, 뒤집기
점수 변경:
rexam <- read.table("rexam.dat", header = TRUE)
#checking histo
hchart(rexam$numeracy, name = "Numeracy Density")ggplot(rexam, aes(numeracy))+
geom_histogram(aes(y=..density..), color="black", fill="white", binwidth = 1.5)+
stat_function(fun = dnorm,
args = list(mean= mean(rexam$numeracy, na.rm= TRUE),
sd = sd(rexam$numeracy, na.rm= TRUE)), col="red")*1. 로그 변환
rexam$log_numeracy <- log(rexam$numeracy)
ggplot(rexam, aes(log_numeracy))+
geom_histogram(aes(y=..density..), color="black", fill="white", binwidth = 0.7)+
stat_function(fun = dnorm,
args = list(mean= mean(rexam$log_numeracy, na.rm= TRUE),
sd = sd(rexam$log_numeracy, na.rm= TRUE)), col="red")##
## Shapiro-Wilk normality test
##
## data: rexam$log_numeracy
## W = 0.95911, p-value = 0.003474
*2 제곱근
rexam$sqrtnemeracy <- sqrt(rexam$numeracy)
ggplot(rexam, aes(sqrtnemeracy))+
geom_histogram(aes(y=..density..), color="black", fill="white", binwidth = 0.3)+
stat_function(fun = dnorm,
args = list(mean= mean(rexam$sqrtnemeracy, na.rm= TRUE),
sd = sd(rexam$sqrtnemeracy, na.rm= TRUE)), col="red")##
## Shapiro-Wilk normality test
##
## data: rexam$sqrtnemeracy
## W = 0.96953, p-value = 0.02036
*3 역수 변환
rexam$recnumeracy <- 1/(rexam$numeracy)
ggplot(rexam, aes(recnumeracy)) +
geom_histogram(aes(y=..density..),
colour="black", fill="white", binwidth = .1) +
labs(x="Reciprocal of Numeracy Scores", y = "Density") +
stat_function(fun=dnorm,
args=list (mean=mean (rexam$recnumeracy, na.rm=TRUE),
sd=sd(rexam$recnumeracy, na.rm = TRUE)))##
## Shapiro-Wilk normality test
##
## data: rexam$recnumeracy
## W = 0.76329, p-value = 0.00000000002135