R(4-2)

Week 2

변수 만들기

# Scalars
a <- 1
b <- 2
c <- 3
d <- 3.5
# Numeric vectors: literal values, an integer range, and two stepped sequences
var1 <- c(1, 2, 5, 7, 8)
var2 <- 1:5
var3 <- seq(from = 1, to = 10, by = 2)
var4 <- seq(from = 10, to = 100, by = 7)
# Character vectors
class <- c("사과", "배", "오렌지", "자두", "귤", "바나나")
str4 <- c("a", "b", "c")
str5 <- c("Hello", "world", "is", "good!")

문자를 다루는 함수 이용하기

# Collapse the words of str5 into a single string separated by spaces
paste0(str5, collapse = " ")

## [1] "Hello world is good!"

ggplot2 패키지 설치 후 함수 사용하기

# Install ggplot2 only when it is missing, so re-running these notes does not
# download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(ggplot2)
x <- c("a", "a", "b", "c")
# Quick plot of the values in x
qplot(x)

# hwy: fuel economy
qplot(data = mpg, x = hwy)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plot of hwy per drv value, coloured by drv
qplot(data = mpg, x = drv, y = hwy, geom = "boxplot", col = drv)

# wt against mpg from mtcars; the variants below map cyl to colour or size,
# or facet the plot by vs ~ am
qplot(mpg, wt, data = mtcars)

qplot(mpg, wt, data = mtcars, colour = cyl)

qplot(mpg, wt, data = mtcars, size = cyl)

qplot(mpg, wt, data = mtcars, facets = vs ~ am)

Week 3

데이터 프레임 만들기

# Score vectors and class labels for four students
english <- c(90, 80, 70, 60)
math <- c(50, 60, 100, 20)
class <- c(1, 1, 2, 2)
# Assemble the data frame from the vectors defined above
df_midterm <- data.frame(english = english, math = math, class = class)
mean(df_midterm[["english"]])

## [1] 75

mean(df_midterm[["math"]])

## [1] 57.5

# Small table with goods, price and sell columns
df_fruit <- data.frame(
  goods = c("사과", "딸기", "수박"),
  price = c(1800, 1500, 3000),
  sell = c(24, 38, 13)
)
mean(df_fruit[["price"]])

## [1] 2100

mean(df_fruit[["sell"]])

## [1] 25

엑셀 파일 불러오기

# Install readxl only when it is missing instead of on every run.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}

library(readxl)

setwd("C:\\Users\\user\\Desktop\\R")

# Read the workbook and average each subject column
df_exam <- read_excel("excel_exam.xlsx")
mean(df_exam$math)

## [1] 57.45

mean(df_exam$english)

## [1] 84.9

mean(df_exam$science)

## [1] 59.45

# The first row is not a row of variable names, so do not use it as a header
df_exam_novar <- read_excel("excel_exam_novar.xlsx", col_names = FALSE)

## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5

# The workbook has several sheets; read the third one
df_exam_sheet <- read_excel("excel_exam_sheet.xlsx", sheet = 3)

csv, rds 파일 활용하기

setwd("C:\\Users\\user\\Desktop\\R")

df_csv_exam <- read.csv("csv_exam.csv", stringsAsFactors = FALSE)
str(df_csv_exam)

## 'data.frame':    20 obs. of  5 variables:
##  $ id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ class  : int  1 1 1 1 2 2 2 2 3 3 ...
##  $ math   : int  50 60 45 30 25 50 80 90 20 50 ...
##  $ english: int  98 97 86 98 80 89 90 78 98 98 ...
##  $ science: int  50 60 78 58 65 98 45 25 15 45 ...

write.csv(df_midterm, file = "df_midterm.csv")

saveRDS(df_midterm, file = "df_midterm.rds")
rm(df_midterm)
# The object can be recovered from the .rds file after being removed from R.
# readRDS() only returns the object, so assign it; without the assignment
# df_midterm would stay deleted for the rest of the session.
df_midterm <- readRDS("df_midterm.rds")
df_midterm

##   english math class
## 1      90   50     1
## 2      80   60     1
## 3      70  100     2
## 4      60   20     2

Week 4

데이터 파악하기

# Work on a plain data.frame copy of ggplot2's mpg data set
mpg <- as.data.frame(ggplot2::mpg)
# Number of rows and columns
dim(mpg)

## [1] 234  11

# First five rows
head(mpg, n = 5)

##   manufacturer model displ year cyl      trans drv cty hwy fl   class
## 1         audi    a4   1.8 1999   4   auto(l5)   f  18  29  p compact
## 2         audi    a4   1.8 1999   4 manual(m5)   f  21  29  p compact
## 3         audi    a4   2.0 2008   4 manual(m6)   f  20  31  p compact
## 4         audi    a4   2.0 2008   4   auto(av)   f  21  30  p compact
## 5         audi    a4   2.8 1999   6   auto(l5)   f  16  26  p compact

# Last five rows
tail(mpg, n = 5)

##     manufacturer  model displ year cyl      trans drv cty hwy fl   class
## 230   volkswagen passat   2.0 2008   4   auto(s6)   f  19  28  p midsize
## 231   volkswagen passat   2.0 2008   4 manual(m6)   f  21  29  p midsize
## 232   volkswagen passat   2.8 1999   6   auto(l5)   f  16  26  p midsize
## 233   volkswagen passat   2.8 1999   6 manual(m5)   f  18  26  p midsize
## 234   volkswagen passat   3.6 2008   6   auto(s6)   f  17  26  p midsize

# Show the structure of mpg: one line per column with its type and first values
str(mpg)

## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...

# Per-column summary of mpg
summary(mpg)

##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00

변수명 바꾸기

# Install dplyr only when it is missing instead of on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Renamed copy of mpg (cty -> city, hwy -> highway); mpg itself is unchanged
mpg.new <- rename(mpg, city = cty, highway = hwy)
head(mpg.new, n = 5)

##   manufacturer model displ year cyl      trans drv city highway fl   class
## 1         audi    a4   1.8 1999   4   auto(l5)   f   18      29  p compact
## 2         audi    a4   1.8 1999   4 manual(m5)   f   21      29  p compact
## 3         audi    a4   2.0 2008   4 manual(m6)   f   20      31  p compact
## 4         audi    a4   2.0 2008   4   auto(av)   f   21      30  p compact
## 5         audi    a4   2.8 1999   6   auto(l5)   f   16      26  p compact

파생변수 만들기 및 활용

# Average of the highway and city columns per row
mpg.new$total <- with(mpg.new, (highway + city) / 2)
# Grade the average: A for >= 30, B for >= 25, C for >= 20, otherwise D.
# Left-closed bins give the same thresholds as a nested ifelse() chain.
mpg.new$test2 <- as.character(cut(
  mpg.new$total,
  breaks = c(-Inf, 20, 25, 30, Inf),
  labels = c("D", "C", "B", "A"),
  right = FALSE,
  include.lowest = TRUE
))
table(mpg.new$test2)

## 
##   A   B   C   D 
##  10  33  85 106

# Grade counts split by year
table(mpg.new$test2, mpg.new$year)

##    
##     1999 2008
##   A    5    5
##   B   14   19
##   C   46   39
##   D   52   54

qplot(mpg.new$test2)

Week 7

데이터 가공하기1

setwd("C:\\Users\\user\\Desktop\\R")
exam <- read.csv("csv_exam.csv")
# filter() extracts rows; %>% is the pipe (shortcut: Ctrl+Shift+M)
exam %>%
  filter(class == 1)

##   id class math english science
## 1  1     1   50      98      50
## 2  2     1   60      97      60
## 3  3     1   45      86      78
## 4  4     1   30      98      58

# select() extracts columns
exam %>%
  select(id, math) %>%
  head(10)

##    id math
## 1   1   50
## 2   2   60
## 3   3   45
## 4   4   30
## 5   5   25
## 6   6   50
## 7   7   80
## 8   8   90
## 9   9   20
## 10 10   50

# arrange() sorts rows; show the three lowest math scores
exam %>%
  arrange(math) %>%
  head(3)

##   id class math english science
## 1  9     3   20      98      15
## 2  5     2   25      80      65
## 3  4     1   30      98      58

# mutate() adds a column; here the sum of the three subject scores
exam %>%
  mutate(total = math + english + science) %>%
  head(5)

##   id class math english science total
## 1  1     1   50      98      50   198
## 2  2     1   60      97      60   217
## 3  3     1   45      86      78   209
## 4  4     1   30      98      58   186
## 5  5     2   25      80      65   170

Week 9

데이터 가공하기2

# Midterm and final scores for the same five ids
test1 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  midterm = c(60, 80, 70, 90, 85)
)
test2 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  final = c(70, 83, 65, 95, 80)
)

# Add the final column to the midterm table, matching rows on id
total <- test1 %>%
  left_join(test2, by = "id")
total

##   id midterm final
## 1  1      60    70
## 2  2      80    83
## 3  3      70    65
## 4  4      90    95
## 5  5      85    80

# Two groups with the same columns, stacked row-wise
group_a <- data.frame(
  id = c(1, 2, 3, 4, 5),
  test = c(60, 80, 70, 90, 85)
)
group_b <- data.frame(
  id = c(6, 7, 8, 9, 10),
  test = c(70, 83, 65, 95, 80)
)
group_all <- group_a %>%
  bind_rows(group_b)
group_all

##    id test
## 1   1   60
## 2   2   80
## 3   3   70
## 4   4   90
## 5   5   85
## 6   6   70
## 7   7   83
## 8   8   65
## 9   9   95
## 10 10   80

결측치 정제하기

# Small example with one missing value in each column
df <- data.frame(
  sex = c("M", "F", NA, "M", "F"),
  score = c(5, 4, 3, 4, NA)
)
df

##    sex score
## 1    M     5
## 2    F     4
## 3 <NA>     3
## 4    M     4
## 5    F    NA

# Cell-by-cell missingness
is.na(df)

##        sex score
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,]  TRUE FALSE
## [4,] FALSE FALSE
## [5,] FALSE  TRUE

# Counts of non-missing vs missing cells: whole table first, then per column
table(is.na(df))

## 
## FALSE  TRUE 
##     8     2

table(is.na(df[["sex"]]))

## 
## FALSE  TRUE 
##     4     1

table(is.na(df[["score"]]))

## 
## FALSE  TRUE 
##     4     1

# Keep only the rows where neither score nor sex is missing
df_nomiss <- df %>%
  filter(!is.na(score), !is.na(sex))
df_nomiss

##   sex score
## 1   M     5
## 2   F     4
## 3   M     4

# na.omit() drops every row that contains an NA
df_nomiss2 <- na.omit(df)
df_nomiss2

##   sex score
## 1   M     5
## 2   F     4
## 4   M     4

setwd("C:\\Users\\user\\Desktop\\R")
exam <- read.csv("csv_exam.csv")
# Blank out three math scores to create missing values
exam[c(3, 8, 15), "math"] <- NA
exam %>%
  summarise(mean_math = mean(math, na.rm = TRUE))

##   mean_math
## 1  55.23529

# Fill the missing math scores with 55
exam$math[is.na(exam$math)] <- 55
table(is.na(exam$math))

## 
## FALSE 
##    20

이상치 정제하기

# Box-plot statistics of hwy; the first and last values are 12 and 37
boxplot(mpg$hwy)$stats

##      [,1]
## [1,]   12
## [2,]   18
## [3,]   24
## [4,]   27
## [5,]   37
## attr(,"class")
##         1 
## "integer"

# Set hwy values below 12 or above 37 to NA
mpg$hwy <- replace(mpg$hwy, mpg$hwy < 12 | mpg$hwy > 37, NA)
table(is.na(mpg$hwy))

## 
## FALSE  TRUE 
##   231     3

# Mean hwy per drv value, skipping the NA entries
mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   drv   mean_hwy
##   <chr>    <dbl>
## 1 4         19.2
## 2 f         27.7
## 3 r         21

그래프 만들기

# Scatter plot of hwy against displ
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point()

## Warning: Removed 3 rows containing missing values (geom_point).

# Same plot with the axes limited to displ 3-6 and hwy 10-30
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  xlim(3, 6) +
  ylim(10, 30)

## Warning: Removed 105 rows containing missing values (geom_point).

# Mean hwy per drv value, skipping the NA entries
df_mpg <- mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

df_mpg

## # A tibble: 3 x 2
##   drv   mean_hwy
##   <chr>    <dbl>
## 1 4         19.2
## 2 f         27.7
## 3 r         21

# Bar chart of the means, then the same bars reordered by descending mean
ggplot(data = df_mpg, aes(x = drv, y = mean_hwy)) +
  geom_col()

ggplot(data = df_mpg, aes(x = reorder(drv, -mean_hwy), y = mean_hwy)) +
  geom_col()

# geom_bar() takes only an x aesthetic here
ggplot(data = mpg, aes(x = drv)) +
  geom_bar()

ggplot(data = mpg, aes(x = hwy)) +
  geom_bar()

## Warning: Removed 3 rows containing non-finite values (stat_count).

# Line chart of unemploy over date from the economics data set
ggplot(data = economics, aes(x = date, y = unemploy)) +
  geom_line()

Week 11

한국 복지 패널 데이터 분석 준비하기

# Install foreign only when it is missing instead of on every run.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}

library(foreign)
library(dplyr)
library(ggplot2)
library(readxl)

setwd("C:\\Users\\user\\Desktop\\R")
raw_welfare <- read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = TRUE)

## Warning in read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = T):
## Koweps_hpc10_2015_beta1.sav: Compression bias (0) is not the usual value of 100

# Keep the raw import untouched and work on a copy
welfare <- raw_welfare

dim(welfare)

## [1] 16664   957

# Give the variables used in the analyses below readable names
welfare <- rename(
  welfare,
  sex = h10_g3, # sex
  birth = h10_g4, # year of birth
  marriage = h10_g10, # marital status
  religion = h10_g11, # religion
  income = p1002_8aq1, # monthly wage
  code_job = h10_eco9, # job code
  code_region = h10_reg7 # region code
)

성별에 따른 월급 차이

# Inspect the sex variable
class(welfare$sex)

## [1] "numeric"

table(welfare$sex)

## 
##    1    2 
## 7578 9086

# Recode: 1 becomes "male", any other value becomes "female"
welfare$sex <- ifelse(welfare$sex == 1, "male", "female")
qplot(welfare$sex)

# Inspect the income variable
class(welfare$income)

## [1] "numeric"

summary(welfare$income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   122.0   192.5   241.6   316.6  2400.0   12030

qplot(welfare$income) +
  xlim(0, 1000)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 12051 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

# Handle missing values: treat income values 0 and 9999 as NA
welfare$income <- ifelse(welfare$income %in% c(0, 9999), NA, welfare$income)
table(is.na(welfare$income))

## 
## FALSE  TRUE 
##  4620 12044

# Analyse the income difference by sex
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))

## `summarise()` ungrouping output (override with `.groups` argument)

sex_income

## # A tibble: 2 x 2
##   sex    mean_income
##   <chr>        <dbl>
## 1 female        163.
## 2 male          312.

ggplot(data = sex_income, aes(x = sex, y = mean_income)) +
  geom_col()

나이 및 연령대에 따른 월급 차이

# Inspect the birth variable used to derive age
class(welfare$birth)

## [1] "numeric"

summary(welfare$birth)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1907    1946    1966    1968    1988    2014

qplot(welfare$birth)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Derived variable: age, computed as 2015 - birth + 1
welfare$age <- 2015 - welfare$birth + 1
summary(welfare$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   28.00   50.00   48.43   70.00  109.00

qplot(welfare$age)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Analyse the income difference by age
age_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))

## `summarise()` ungrouping output (override with `.groups` argument)

age_income

## # A tibble: 69 x 2
##      age mean_income
##    <dbl>       <dbl>
##  1    20        121.
##  2    21        106.
##  3    22        130.
##  4    23        142.
##  5    24        134.
##  6    25        145.
##  7    26        158.
##  8    27        188.
##  9    28        205.
## 10    29        189.
## # ... with 59 more rows

ggplot(data = age_income, aes(x = age, y = mean_income)) +
  geom_line()

# Analyse the income difference by age group
# Derived variable: age below 30 is "young", up to 59 is "middle", else "old"
welfare$ageg <- ifelse(welfare$age < 30, "young",
  ifelse(welfare$age <= 59, "middle", "old")
)
table(welfare$ageg)

## 
## middle    old  young 
##   6049   6281   4334

qplot(welfare$ageg)

ageg_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))

## `summarise()` ungrouping output (override with `.groups` argument)

ageg_income

## # A tibble: 3 x 2
##   ageg   mean_income
##   <chr>        <dbl>
## 1 middle        282.
## 2 old           125.
## 3 young         164.

# `limits` is spelled out in full; the earlier `limit =` only worked through
# partial argument matching. It puts the bars in young/middle/old order.
ggplot(data = ageg_income, aes(x = ageg, y = mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))

성별 및 연령대, 나이에 따른 월급 차이

# Income difference by sex and age group
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))

## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)

sex_income

## # A tibble: 6 x 3
## # Groups:   ageg [3]
##   ageg   sex    mean_income
##   <chr>  <chr>        <dbl>
## 1 middle female       188. 
## 2 middle male         353. 
## 3 old    female        81.5
## 4 old    male         174. 
## 5 young  female       160. 
## 6 young  male         171.

# `limits` is spelled out in full instead of relying on partial matching of
# `limit =`.
ggplot(data = sex_income, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Income difference by sex and age
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))

## `summarise()` regrouping output by 'age' (override with `.groups` argument)

# Print the table computed just above; this step previously printed
# sex_income a second time by mistake.
sex_age

# NOTE: the listing below is stale. It is the sex_income table that was
# captured while the wrong object was printed; re-run to refresh it.
## # A tibble: 6 x 3
## # Groups:   ageg [3]
##   ageg   sex    mean_income
##   <chr>  <chr>        <dbl>
## 1 middle female       188. 
## 2 middle male         353. 
## 3 old    female        81.5
## 4 old    male         174. 
## 5 young  female       160. 
## 6 young  male         171.

ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) +
  geom_line()

직업별 월급 차이

# Type of the job-code variable
class(welfare[["code_job"]])

## [1] "numeric"

# Number of respondents per job code
table(welfare[["code_job"]])

## 
##  111  120  131  132  133  134  135  139  141  149  151  152  153  159  211  212 
##    2   16   10   11    9    3    7   10   35   20   26   18   15   16    8    4 
##  213  221  222  223  224  231  232  233  234  235  236  237  239  241  242  243 
##    3   17   31   12    4   41    5    3    6   48   14    2   29   12    4   63 
##  244  245  246  247  248  251  252  253  254  259  261  271  272  273  274  281 
##    4   33   59   77   38   14  111   24   67  109    4   15   11    4   36   17 
##  283  284  285  286  289  311  312  313  314  320  330  391  392  399  411  412 
##    8   10   26   16    5  140  260  220   84   75   15    4   13   87   47   12 
##  421  422  423  429  431  432  441  442  510  521  522  530  611  612  613  620 
##  124   71    5   14   20   33  154  197  192  353    5  106 1320   11   40    2 
##  630  710  721  722  730  741  742  743  751  752  753  761  762  771  772  773 
##   20   29   30   22   16   27    3   34   34    5   49   69   27   11   61   86 
##  774  780  791  792  799  811  812  819  821  822  823  831  832  841  842  843 
##    7   17    5   21   45   16    1    6    9    9   23    5   17   32   10    4 
##  851  852  853  854  855  861  862  863  864  871  873  874  875  876  881  882 
##   19   13    7   33    9    3   14   17   31    2  257   34   37    2    2    3 
##  891  892  899  910  921  922  930  941  942  951  952  953  991  992  999 1011 
##    8   19   16  102   31   74  289  325   99  125  122   73   45   12  141    2 
## 1012 
##   17

library(readxl)

# Preprocessing: read the job-code lookup table from sheet 2 of the codebook
setwd("C:\\Users\\user\\Desktop\\R")
list_job <- read_excel(
  "Koweps_Codebook.xlsx",
  sheet = 2,
  col_names = TRUE
)
head(list_job)

## # A tibble: 6 x 2
##   code_job job                                
##      <dbl> <chr>                              
## 1      111 의회의원 고위공무원 및 공공단체임원
## 2      112 기업고위임원                       
## 3      120 행정 및 경영지원 관리자            
## 4      131 연구 교육 및 법률 관련 관리자      
## 5      132 보험 및 금융 관리자                
## 6      133 보건 및 사회복지 관련 관리자

dim(list_job)

## [1] 149   2

# Name the join key with `by`. left_join() has no `id` argument, so the
# earlier `id = "code_job"` did not select the key: dplyr fell back to the
# common column and printed `Joining, by = "code_job"`. With `by` given
# explicitly that message is no longer emitted.
welfare <- left_join(welfare, list_job, by = "code_job")

# Show code and job name for the first ten rows that have a job code
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(10)

##    code_job                                job
## 1       942                   경비원 및 검표원
## 2       762                             전기공
## 3       530 방문 노점 및 통신 판매 관련 종사자
## 4       999        기타 서비스관련 단순 종사원
## 5       312                    경영관련 사무원
## 6       254             문리 기술 및 예능 강사
## 7       510                        영업 종사자
## 8       530 방문 노점 및 통신 판매 관련 종사자
## 9       286   스포츠 및 레크레이션 관련 전문가
## 10      521                   매장 판매 종사자

# Mean income per job, then the top 10 and bottom 10 jobs
job_income <- welfare %>%
  filter(!is.na(job), !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income)) %>%
  arrange(desc(mean_income))

## `summarise()` ungrouping output (override with `.groups` argument)

top10 <- head(job_income, 10)
bottom10 <- tail(job_income, 10)
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

# ylim(0, 850) fixes the value axis range for the bottom-10 chart
ggplot(data = bottom10, aes(x = reorder(job, -mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  ylim(0, 850)

성별 직업 빈도

# Job counts among rows with sex == "male", most frequent first
male_job <- welfare %>%
  filter(!is.na(job), sex == "male") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

## `summarise()` ungrouping output (override with `.groups` argument)

male_top10 <- head(male_job, 10)

# Same count for rows with sex == "female"
female_job <- welfare %>%
  filter(!is.na(job), sex == "female") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

## `summarise()` ungrouping output (override with `.groups` argument)

female_top10 <- head(female_job, 10)

ggplot(data = male_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

ggplot(data = female_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

종교 유무에 따른 이혼율

class(welfare[["religion"]])

## [1] "numeric"

table(welfare[["religion"]])

## 
##    1    2 
## 8047 8617

class(welfare[["marriage"]])

## [1] "numeric"

table(welfare[["marriage"]])

## 
##    0    1    2    3    4    5    6 
## 2861 8431 2117  712   84 2433   26

# Preprocessing: religion 1 becomes "yes", any other value becomes "no"
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")
qplot(welfare$religion)

# Marriage group: 1 becomes "marriage", 3 becomes "divorce", all others NA
welfare$group_marriage <- ifelse(
  welfare$marriage == 1,
  "marriage",
  ifelse(welfare$marriage == 3, "divorce", NA)
)
table(welfare$group_marriage)

## 
##  divorce marriage 
##      712     8431

qplot(welfare$group_marriage)

# Percentage of each marriage group within each religion value
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

## `summarise()` regrouping output by 'religion' (override with `.groups` argument)

divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)

# Draw the chart
ggplot(data = divorce, aes(x = religion, y = pct)) +
  geom_col()

# Divorce rate by age group
ageg_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)

ageg_divorce <- ageg_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, pct)
ggplot(data = ageg_divorce, aes(x = ageg, y = pct)) +
  geom_col()

# Divorce rate by age group and religion
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

## `summarise()` regrouping output by 'ageg', 'religion' (override with `.groups` argument)

ageg_religion_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)
ggplot(data = ageg_religion_divorce, aes(x = ageg, y = pct, fill = religion)) +
  geom_col(position = "dodge")

지역별 연령대 비율

class(welfare$code_region)

## [1] "numeric"

table(welfare$code_region)

## 
##    1    2    3    4    5    6    7 
## 2486 3711 2785 2036 1467 1257 2922

# Preprocessing: lookup table from region code to region name
list_region <- data.frame(
  code_region = 1:7,
  region = c(
    "서울", "수도권(인천/경기)", "부산/경남/울산", "대구/경북",
    "대전/충남", "강원/충북", "광주/전남/전북/제주도"
  )
)
welfare <- left_join(welfare, list_region, by = "code_region")

# Percentage of each age group within each region
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 2)
  )

## `summarise()` regrouping output by 'region' (override with `.groups` argument)

# Draw the chart
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip()

# Sort the regions by their share of the "old" group
list_order_old <- region_ageg %>%
  filter(ageg == "old") %>%
  arrange(pct)
order <- list_order_old$region
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

# Arrange the bar colours in age-group order via the factor levels.
# `levels` is spelled out in full; `level =` only worked through partial
# argument matching.
region_ageg$ageg <- factor(region_ageg$ageg, levels = c("old", "middle", "young"))
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

Week 12

# Install stringr and wordcloud only when they are missing instead of on
# every run.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}

# NOTE(review): KoNLP is loaded but never installed in these notes; it is
# presumably set up separately -- confirm.
library(KoNLP)

## Checking user defined dictionary!

library(stringr)
library(dplyr)
library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)

### 힙합 가사 텍스트 마이닝

setwd("C:\\Users\\user\\Desktop\\R")
# Read the lyrics file line by line into a character vector
txt <- readLines(con = "hiphop.txt")
head(txt)

## [1] "\"보고 싶다"                  "이렇게 말하니까 더 보고 싶다"
## [3] "너희 사진을 보고 있어도"      "보고 싶다"                   
## [5] "너무 야속한 시간"             "나는 우리가 밉다"

# Replace every non-word character with a space.
# (Author's note: gsub("\\W", " ", txt) would presumably work as well.)
txt <- str_replace_all(txt, "\\W", " ")
extractNoun("대한민국의 영토는 한반도와 그 부속도서로 한다")

## [1] "대한"     "민국"     "영토"     "한반도와" "부속도서" "한"

# Find the most frequently used words
# Extract nouns from the lyrics; similar to strsplit() but seems to keep
# only the nouns
nouns <- extractNoun(txt)
wordcount <- table(unlist(nouns))
# Frequency table as a data frame with word/freq columns, keeping only
# words of at least two characters
df_word <- wordcount %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  rename(word = Var1, freq = Freq) %>%
  filter(nchar(word) >= 2)
top_20 <- df_word %>%
  arrange(desc(freq)) %>%
  head(20)

# Draw the word cloud
pal <- brewer.pal(8, "Dark2")
# Fixed seed so the layout is the same on every run
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(4, 0.3),
  colors = pal
)

### 국정원 트윗 텍스트 마이닝

# Clean the data
setwd("C:\\Users\\user\\Desktop\\R")
twitter <- read.csv(
  "twitter.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
# Give the Korean column headers short names
twitter <- rename(twitter, no = 번호, id = 계정이름, date = 작성일, tw = 내용)
twitter$tw <- str_replace_all(twitter$tw, "\\W", " ")

# Find the most frequently used words
nouns <- extractNoun(twitter$tw)
wordcount <- table(unlist(nouns))
df_word <- wordcount %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  rename(word = Var1, freq = Freq) %>%
  filter(nchar(word) >= 2)
top_20 <- df_word %>%
  arrange(desc(freq)) %>%
  head(20)

# Draw the word-frequency chart
order <- arrange(top_20, freq)$word
# `limits` is spelled out in full; `limit =` only worked through partial
# argument matching.
ggplot(data = top_20, aes(x = word, y = freq)) +
  ylim(0, 2500) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order) +
  geom_text(aes(label = freq), hjust = -0.3)

# Draw the word cloud with the Dark2 palette
pal <- brewer.pal(8, "Dark2")
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(6, 0.2),
  colors = pal
)

# Same word cloud using entries 5 to 9 of the Blues palette
pal <- brewer.pal(9, "Blues")[5:9]
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(6, 0.2),
  colors = pal
)

R(4-2)

김수한

2020 9 25

Week 2

변수 만들기

문자를 다루는 함수 이용하기

ggplot2 패키지 설치 후 함수 사용하기

Week 3

데이터 프레임 만들기

엑셀 파일 불러오기

csv, rds 파일 활용하기

Week 4

데이터 파악하기

변수명 바꾸기

파생변수 만들기 및 활용

Week 7

데이터 가공하기1

Week 9

데이터 가공하기2

결측치 정제하기

이상치 정제하기

그래프 만들기

Week 11

한국 복지 패널 데이터 분석 준비하기

성별에 따른 월급 차이

나이 및 연령대에 따른 월급 차이

성별 및 연령대, 나이에 따른 월급 차이

직업별 월급 차이

성별 직업 빈도

종교 유무에 따른 이혼율

지역별 연령대 비율

Week 12