# Exam-score example: for each subject, find the score that marks the top 20%
# of a normal distribution with that subject's mean and standard deviation,
# then record whether the student's score beats that cutoff.
student <- c("A", "유신", "C", "D", "E")
subject <- c("kor", "eng", "math", "sci", "soc")
score <- c(75, 80, 92, 77, 90)
subject_mean <- c(68.4, 75.1, 63.5, 76.2, 80.3)
subject_sd <- c(4.9, 11.2, 9.8, 5.2, 8.5)

# Score at the point where the upper-tail probability area is 20%
# (first subject only).
x <- qnorm(1 - 0.2, mean = 68.4, sd = 4.9)

# Help pages are only useful in an interactive session.
if (interactive()) {
  help("qnorm")
}

var.score <- data.frame(
  과목 = subject,
  점수 = score,
  과목별평균 = subject_mean,
  표준편차 = subject_sd,
  # Keep subject names as character rather than factor. The argument name
  # must be spelled exactly; a misspelling silently becomes an extra column.
  stringsAsFactors = FALSE
)

# Add a column holding the top-20% cutoff score for every subject.
var.score[["상위20"]] <- round(
  qnorm(
    1 - 0.2,
    mean = var.score[["과목별평균"]],
    sd = var.score[["표준편차"]]
  ),
  2
)

# Pass/fail flag: "성공" when the score is strictly above the cutoff.
var.score[["패스"]] <- ifelse(
  var.score[["점수"]] > var.score[["상위20"]],
  "성공",
  "실패"
)

# The spreadsheet viewer needs an interactive session.
if (interactive()) {
  View(var.score)
}

yusin <- round(runif(5, 40, 100), 0)
# Basic plot() usage with one and two vectors.
X <- 1:5
X
plot(X)
plot(1:5)

# With a single vector, x increases by 1 and y takes the vector's values.
plot(c(2, 2, 2, 2, 2))

# With two vectors, the first is x and the second is y.
plot(1:3, 3:1)
plot(1:3, 3:1, xlim = c(1, 10), ylim = c(1, 5))

plot(
  1:3, 3:1,
  xlim = c(1, 10),
  ylim = c(1, 5),
  xlab = "x축",
  ylab = "y축",
  main = "테스트"
)
# type = "p" draws points (the default).
# Line-and-point chart of five random values with hand-built axes and titles.
target <- round(runif(5, 100, 150), 0)

plot(
  target,
  type = "o",
  col = "red",
  ylim = c(0, 200),
  axes = FALSE, # hide the default x and y axes
  ann = FALSE   # no default axis titles
)

# x axis: one tick per weekday.
axis(
  1,
  at = 1:5,
  labels = c("월", "화", "수", "목", "금")
)

# y axis: ticks spanning the plotted ylim. Ticks at 1:5 would all collapse
# at the bottom of a 0-200 range.
axis(
  2,
  at = seq(0, 200, by = 50)
)

title(main = "과일", col.main = "red", font.main = 4)
title(xlab = "요일", col.lab = "black")
title(ylab = "가격", col.lab = "blue")
# Compare plot types side by side: three panels per page.
target <- round(runif(5, 100, 150), 0)

# The graphical parameter is `mfrow` (rows, columns); a misspelled name is
# ignored with a warning and the layout stays 1 x 1.
par(mfrow = c(1, 3))
plot(target, type = "o") # line + points
plot(target, type = "s") # stair steps
plot(target, type = "l") # line only

pie(target)
plot(target, type = "o")
barplot(target)
# Overlay several series by drawing a new plot on top of the previous one.
par(mfrow = c(1, 1))
t1 <- c(1, 2, 3, 4, 5)
t2 <- c(5, 4, 3, 2, 1)
t3 <- c(3, 4, 5, 6, 7)

plot(t1, type = "s", col = "red", ylim = c(1, 5))

# par(new = TRUE) tells the next high-level plot not to clear the frame.
par(new = TRUE)
plot(t2, type = "o", col = "blue", ylim = c(1, 5))

par(new = TRUE)
# NOTE(review): this call sets no ylim, so t3 is drawn on its own y scale
# (3-7) over the 1-5 scale used above; axes and labels overprint.
plot(t3, type = "l", col = "black")
# Overlay several series on one set of axes with lines(), plus a legend.
par(mfrow = c(1, 1))
t1 <- c(1, 2, 3, 4, 5)
t2 <- c(5, 4, 3, 2, 1)
t3 <- c(3, 4, 5, 6, 7)

# The first plot fixes the axes; ylim must be wide enough for every series.
plot(t1, type = "s", col = "red", ylim = c(1, 10))

# lines() adds to the existing plot, so no par(new = TRUE) is needed. Setting
# it here would leave the flag on and make the next unrelated plot draw over
# this one. lines() also takes no ylim.
lines(t2, type = "o", col = "blue")
lines(t3, type = "l", col = "green") # third sample is t3, matching the legend

legend(
  4, # x position
  9, # y position
  c("샘플1", "샘플2", "샘플3"),
  cex = 0.9, # text size
  col = c("red", "blue", "green"),
  lty = 1
)
# Bar charts: vertical, horizontal, and grouped bars from a data frame.
barplot(1:5)

barplot(1:5, horiz = TRUE)

banana <- round(runif(5, 100, 150), 0)
cherry <- round(runif(5, 100, 150), 0)
orange <- round(runif(5, 100, 150), 0)
fruit <- data.frame(
  바나나 = banana,
  체리 = cherry,
  오렌지 = orange
)

day <- c("월", "화", "수", "목", "금")

# barplot() needs a vector or matrix, so the data frame is converted first.
# beside = TRUE draws one group per column (fruit) with one bar per row (day).
barplot(
  as.matrix(fruit),
  main = "과일판매량",
  beside = TRUE,
  col = rainbow(nrow(fruit)),
  ylim = c(0, 400)
)

legend(
  14, 400, day,
  cex = 0.8,
  fill = rainbow(nrow(fruit))
)
# Stacked bars: one bar per weekday, stacked by fruit.
banana <- round(runif(5, 100, 150), 0)
cherry <- round(runif(5, 100, 150), 0)
orange <- round(runif(5, 100, 150), 0)
fruit <- data.frame(
  바나나 = banana,
  체리 = cherry,
  오렌지 = orange
)
day <- c("월", "화", "수", "목", "금")

barplot(
  t(fruit), # transpose: rows become fruits, columns become days
  main = "과일판매량",
  col = rainbow(nrow(fruit)),
  # Three values of at most 150 stack up to 450, so a 400 limit could clip
  # the tallest bars.
  ylim = c(0, 450),
  space = 0.1,
  cex.axis = 0.8,
  las = 1,
  names.arg = day,
  cex = 0.8
)
# Bar chart of orange sales with each bar coloured by its sales band.
orange <- round(runif(5, 150, 200), 0)
day <- c("월", "화", "수", "목", "금")

# Vectorised banding (replaces growing `color` inside a loop):
#   >= 180 -> red, >= 150 -> yellow, otherwise green.
color <- ifelse(
  orange >= 180,
  "red",
  ifelse(orange >= 150, "yellow", "green")
)

barplot(
  orange,
  main = "오렌지판매량",
  col = color,
  names.arg = day
)
# Install ggplot2 only when it is missing, then draw a scatter plot of the
# bundled `mpg` data set.
# The package name must be a quoted string, and installed.packages()'s first
# argument is a library path, not a package name, so availability is checked
# with requireNamespace() instead.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

ggplot(
  data = mpg,
  aes(x = displ, y = hwy)
) +
  geom_point() +
  xlim(3, 6) +
  ylim(10, 30)
# Age distribution of the example cancer data set.
# `dl=1` requests the raw file; `dl=0` serves the Dropbox preview page, which
# read.csv() cannot parse as the data.
cancer <- read.csv(
  "https://www.dropbox.com/s/dw59m4q1vaqwayl/example_cancer.csv?dl=1"
)
head(cancer)
class(cancer)
str(cancer)

# Count ages in ten-year bins: (10,20], (20,30], ..., (70,80].
degree_of_age <- table(cut(cancer$age, breaks = (1:8) * 10))

head(degree_of_age)
rownames(degree_of_age) <- c(
  "10대", "20대", "30대", "40대", "50대", "60대", "70대"
)

# STEP 5: frequency polygon of age in ten-year bins.
ggplot(data = cancer, aes(x = age)) +
  geom_freqpoly(binwidth = 10, size = 1.4, color = "red")
# Salary analysis, first pass: load the data, compute descriptive statistics,
# and plot average wage by gender and by career band.
# `dl=1` requests the raw CSV; `dl=0` serves the Dropbox preview page.
csv <- "https://www.dropbox.com/s/9gchq4nbt67lpxu/example_salary.csv?dl=1"
# "-" marks a missing value in the file.
salary <- read.csv(csv, stringsAsFactors = FALSE, na.strings = "-")
str(salary)
colnames(salary)

# Rename the columns to English so they are easy to use in expressions.
# (This pass spells the career column "carrer".)
colnames(salary) <- c(
  "age", "wage", "specialwage", "workingtime", "workercount",
  "carrer", "gender"
)

# Columns are always referenced as salary$...; attach() is not used because
# it puts the columns on the search path, where they can be silently masked.

# Descriptive statistics: mean, median, range.
# NOTE(review): confirm `age` is numeric; mean() returns NA with a warning
# for a character column.
mean(salary$age)
wage_mean <- mean(salary$wage, na.rm = TRUE)
wage_mean

# Median
wage_mid <- median(salary$wage, na.rm = TRUE)
wage_mid

# Range
wage_range <- range(salary$wage, na.rm = TRUE)
wage_range

# Row(s) holding the highest wage, computed rather than hard-coded.
highest_wage <- which(salary$wage == max(salary$wage, na.rm = TRUE))
highest_wage
salary[highest_wage, ]

# Quartiles
qnt <- quantile(salary$wage, na.rm = TRUE)
qnt

sal_list <- list(
  평균월급 = wage_mean,
  월급중앙값 = wage_mid,
  월급범위 = wage_range,
  월급사분위 = qnt
)
sal_list

# Average wage by gender.
wage_avg_per_gender <- tapply(salary$wage, salary$gender, mean, na.rm = TRUE)
wage_avg_per_gender

if (interactive()) {
  help("tapply")
}
x <- tapply(salary$wage, salary$workingtime, mean, na.rm = TRUE)
x

# reshape: install if missing, then load.
if (!requireNamespace("reshape", quietly = TRUE)) {
  install.packages("reshape")
}
library(reshape)

# reshape::melt() on an unnamed 1-d array yields columns `indices` and
# `value` (the later plots in this section rely on those names). It is called
# with an explicit namespace so reshape2::melt() cannot mask it.
temp <- reshape::melt(wage_avg_per_gender)
ggplot(
  data = temp,
  aes(x = indices, y = value, fill = indices)
) +
  geom_bar(stat = "identity")

# Average wage by career band.
salary$carrer
wage_avg_per_carrer <- tapply(salary$wage, salary$carrer, mean, na.rm = TRUE)
wage_avg_per_carrer
temp <- reshape::melt(wage_avg_per_carrer)
ggplot(
  data = temp,
  aes(x = indices, y = value, fill = indices)
) +
  geom_bar(stat = "identity")

# NOTE(review): this variable shares its name with the melt() function; calls
# still resolve to the function, but a distinct name would read better.
melt <- reshape::melt(wage_avg_per_carrer)

ggplot(
  melt,
  aes(x = indices, y = value, group = 1)
) +
  geom_line(color = "blue", size = 2) +
  coord_polar() +
  ylim(0, max(melt$value))

melt$indices

# Normalising data means standardising the distribution of a variable's
# values: subtract the mean and divide by the standard deviation of the whole
# data. The values then have mean 0 and a uniform spread.
# In R, scale() does this.

# Lowest and highest wage within each career band.
tapply(salary$wage, salary$carrer, range, na.rm = TRUE)
library(ggplot2)

wage_scale <- scale(salary$wage)
head(wage_scale, 10)
salary <- cbind(salary, scale = wage_scale)
str(salary)

# Columns are named directly inside aes(); ggplot looks them up in `salary`.
g1 <- ggplot(salary, aes(x = scale, y = age))
g1
g2 <- geom_segment(aes(yend = age), xend = 0)
g3 <- g1 +
  g2 +
  geom_point(size = 7, aes(color = gender, shape = carrer)) +
  theme_minimal()
g3
# Small worked example of what scale() does: centre on the mean and divide
# by the standard deviation.
# NOTE(review): `t` shares its name with base::t(); function calls such as
# t(fruit) still resolve to the function.
t <- c(1, 2, 3, 4, 5)
t
mean(t)
sd(t)
scale(t)
# Salary analysis, second pass: load, summarise, and plot wage gaps by gender
# and by career band.
csv <- "https://www.dropbox.com/s/9gchq4nbt67lpxu/example_salary.csv?dl=1"
# "-" marks a missing value in the file. Argument names are spelled out in
# full rather than relying on partial matching.
salary <- read.csv(csv, stringsAsFactors = FALSE, na.strings = "-")

# Step 2: inspect the raw data.
str(salary)
colnames(salary)
# Original (Korean) column names as printed:
# [1] "연령"
# [2] "월급여액..원."
# [3] "연간특별급여액..원."
# [4] "근로시간..시간."
# [5] "근로자수..명."
# [6] "경력구분"
# [7] "성별"

# Step 3: rename the columns to English so they are easy to compute with.
colnames(salary) <- c(
  "age", "wage", "special_wage", "working_time", "worker_count",
  "career", "gender"
)
colnames(salary)

# Step 4: attach() was considered so that `salary$` could be omitted, but it
# is deliberately left disabled; columns are referenced as salary$... below.
# detach(salary)
# attach(salary)

# Step 5: descriptive statistics (mean, median, range).
salary$wage
wage_mean <- mean(salary$wage, na.rm = TRUE)
wage_mean
# [1] 2171578

# Median
wage_mid <- median(salary$wage, na.rm = TRUE)
wage_mid

# Range
wage_range <- range(salary$wage, na.rm = TRUE)
wage_range
# 1117605 4064286

# Record(s) of whoever earns the highest wage, computed from the data rather
# than hard-coding the maximum.
highest_wage <- which(salary$wage == max(salary$wage, na.rm = TRUE))
salary[highest_wage, ]

# Quartiles
qnt <- quantile(salary$wage, na.rm = TRUE)
qnt

# Step 6: collect the results in a list.
sal_list <- list(
  평균월급 = wage_mean,
  월급중앙값 = wage_mid,
  월급범위 = wage_range,
  월급사분위 = qnt
)
sal_list

# Wage gap by gender.
wage_avg_per_gender <- tapply(salary$wage, salary$gender, mean, na.rm = TRUE)
wage_avg_per_gender
#      남      여
# 2477332 1865823

# reshape2: install if missing, then load.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}
library(reshape2)

# reshape2::melt() names the label column Var1 for an unnamed 1-d array. It
# is called with an explicit namespace so another attached melt() cannot
# mask it.
temp <- reshape2::melt(wage_avg_per_gender)
temp
ggplot(
  data = temp,
  aes(
    x = Var1, # label column produced by melt()
    y = value,
    fill = Var1
  )
) +
  geom_bar(stat = "identity")

# Wage gap by career band.
# salary$career
wage_avg_per_career <- tapply(salary$wage, salary$career, mean, na.rm = TRUE)
wage_avg_per_career
temp <- reshape2::melt(wage_avg_per_career)
temp
ggplot(
  data = temp,
  aes(
    x = Var1, # label column produced by melt()
    y = value,
    fill = Var1
  )
) +
  geom_bar(stat = "identity")

# NOTE(review): this variable shares its name with the melt() function.
melt <- reshape2::melt(wage_avg_per_career)
ggplot(
  melt,
  aes(x = Var1, y = value, group = 1)
) +
  # The argument is `color`; a misspelled name is ignored with a warning and
  # the line is drawn in the default colour.
  geom_line(color = "blue", size = 2) +
  coord_polar() +
  ylim(0, max(melt$value))
# Lowest and highest wage within each career band.
tapply(salary$wage, salary$career, range, na.rm = TRUE)
# Output recorded from a previous run:
# $`1~3년미만`
# [1] 1172399 2619221
#
# $`10년이상`
# [1] 1685204 4064286
#
# $`1년미만`
# [1] 1117605 2414345
#
# $`3~5년미만`
# [1] 1245540 2827420
#
# $`5~10년미만`
# [1] 1548036 3309231

# Row(s) with the lowest wage in each career band, looked up from the data
# instead of matching hard-coded wage values copied from the output above.
lowest_paid <- lapply(
  split(salary, salary$career),
  function(group) {
    group[which(group$wage == min(group$wage, na.rm = TRUE)), ]
  }
)

# Band labels are the ones shown in the recorded output above.
year_1 <- lowest_paid[["1년미만"]]
year_1_3 <- lowest_paid[["1~3년미만"]]
year_3_5 <- lowest_paid[["3~5년미만"]]
year_5_10 <- lowest_paid[["5~10년미만"]]
year_10 <- lowest_paid[["10년이상"]]

career_list <- list(
  year_1, year_1_3, year_3_5, year_5_10, year_10
)
career_list
# Standardise wages and plot them against age group, coloured by gender and
# shaped by career band.
wage_scale <- scale(salary$wage)
head(wage_scale, 10)
# Output recorded from a previous run:
#              [,1]
#  [1,] -1.28886999
#  [2,] -0.91757018
#  [3,] -0.38981924
#  [4,] -0.06340878
#  [5,]  0.37924689
#  [6,]  0.31343053
#  [7,]  0.28505815
#  [8,] -0.04016661
#  [9,] -0.13812959
# [10,] -0.78222571
# The mean is 0 and the values are spread around 0.

salary <- cbind(salary, scale = wage_scale)
str(salary)

# Columns are named directly inside aes(); ggplot looks them up in `salary`.
g1 <- ggplot(salary, aes(x = scale, y = age))
# Horizontal segment from each point back to the x = 0 (mean wage) line.
g2 <- geom_segment(aes(yend = age), xend = 0)
g3 <- g1 +
  g2 +
  geom_point(size = 7, aes(color = gender, shape = career)) +
  theme_minimal()

g3
# Interpretation (the author's reading of the plot):
# - Men aged 45-54 with 10+ years of career are the highest earners.
# - The 25-29 age group shows little spread.
# - From age 45 upward the gap between groups is large.
# - Low wages appear mostly in the female groups.
# - High wages appear mostly in the male groups.