Week 2

변수 만들기

# Scalar variables.
# Note: `c` (and `class` below) are data objects that share a name with a base
# function; calls such as c(...) still resolve to the function.
a <- 1
b <- 2
c <- 3
d <- 3.5

# Numeric vectors: explicit values, an integer range, and stepped sequences.
var1 <- c(1, 2, 5, 7, 8)
var2 <- c(1:5)
var3 <- seq(1, 10, by = 2)
var4 <- seq(10, 100, by = 7)

# Character vectors. The Korean fruit names were stored as mis-decoded UTF-8
# (mojibake) and are restored here.
class <- c("사과", "배", "오렌지", "자두", "귤", "바나나")
str4 <- c("a", "b", "c")
str5 <- c("Hello", "world", "is", "good!")

문자를 다루는 함수 이용하기

# Join the words of str5 into a single string separated by spaces.
paste0(str5, collapse = " ")
## [1] "Hello world is good!"

ggplot2 패키지 설치 후 함수 사용하기

# Install ggplot2 only when it is missing, so re-running the script does not
# download the package again every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# NOTE(review): qplot() is deprecated in recent ggplot2 releases; consider
# migrating these calls to ggplot() - confirm against the installed version.

# Bar chart of how often each value occurs in a character vector.
x <- c("a", "a", "b", "c")
qplot(x)

# Histogram of hwy (highway fuel economy) from the mpg dataset.
qplot(data = mpg, x = hwy)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plot of hwy for each drive type, coloured by drive type.
qplot(data = mpg, x = drv, y = hwy, geom = "boxplot", col = drv)

# Scatter plots of wt against mpg from mtcars: plain, coloured by cyl,
# sized by cyl, and split into panels by vs (rows) and am (columns).
qplot(mpg, wt, data = mtcars)

qplot(mpg, wt, data = mtcars, colour = cyl)

qplot(mpg, wt, data = mtcars, size = cyl)

qplot(mpg, wt, data = mtcars, facets = vs ~ am)

## Week 3

데이터 프레임 만들기

# Exam scores and class labels for four students.
english <- c(90, 80, 70, 60)
math <- c(50, 60, 100, 20)
class <- c(1, 1, 2, 2)

# Build the data frame from the vectors defined above instead of retyping the
# same literals, so the two cannot drift apart. Column names are taken from
# the variable names.
df_midterm <- data.frame(english, math, class)
mean(df_midterm$english)
## [1] 75
mean(df_midterm$math)
## [1] 57.5

# Fruit sales table. The Korean product names were stored as mis-decoded
# UTF-8 (mojibake) and are restored here.
df_fruit <- data.frame(
  goods = c("사과", "딸기", "수박"),
  price = c(1800, 1500, 3000),
  sell = c(24, 38, 13)
)
mean(df_fruit$price)
## [1] 2100
mean(df_fruit$sell)
## [1] 25

엑셀 파일 불러오기

# Install readxl only when it is missing, so re-running the script does not
# download the package again every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the reads below use relative names.
setwd("C:\\Users\\user\\Desktop\\R")

# Read the exam workbook and average each subject.
df_exam <- read_excel("excel_exam.xlsx")
mean(df_exam$math)
## [1] 57.45
mean(df_exam$english)
## [1] 84.9
mean(df_exam$science)
## [1] 59.45
# The first row of this file is data, not variable names, so column names
# are generated automatically (see the "New names" message below).
df_exam_novar <- read_excel("excel_exam_novar.xlsx", col_names = FALSE)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
# Read the third sheet of a workbook that contains several sheets.
df_exam_sheet <- read_excel("excel_exam_sheet.xlsx", sheet = 3)

csv, rds 파일 활용하기

# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the file names below are relative.
setwd("C:\\Users\\user\\Desktop\\R")

# Read the exam CSV, keeping text columns as character rather than factor.
df_csv_exam <- read.csv("csv_exam.csv", stringsAsFactors = FALSE)
str(df_csv_exam)
## 'data.frame':    20 obs. of  5 variables:
##  $ id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ class  : int  1 1 1 1 2 2 2 2 3 3 ...
##  $ math   : int  50 60 45 30 25 50 80 90 20 50 ...
##  $ english: int  98 97 86 98 80 89 90 78 98 98 ...
##  $ science: int  50 60 78 58 65 98 45 25 15 45 ...
# Export the midterm data frame as a CSV file.
write.csv(df_midterm, file = "df_midterm.csv")

# Save the data frame as RDS, delete it from the workspace, then restore it.
saveRDS(df_midterm, file = "df_midterm.rds")
rm(df_midterm)
# readRDS() only returns the object; it must be assigned, otherwise
# df_midterm stays deleted even though its contents get printed.
df_midterm <- readRDS("df_midterm.rds")
df_midterm
##   english math class
## 1      90   50     1
## 2      80   60     1
## 3      70  100     2
## 4      60   20     2

Week 4

데이터 파악하기

# Work on a plain data.frame copy of the mpg dataset from ggplot2.
mpg <- as.data.frame(ggplot2::mpg)
# Number of rows and columns.
dim(mpg)
## [1] 234  11
# First five rows.
head(mpg, n = 5)
##   manufacturer model displ year cyl      trans drv cty hwy fl   class
## 1         audi    a4   1.8 1999   4   auto(l5)   f  18  29  p compact
## 2         audi    a4   1.8 1999   4 manual(m5)   f  21  29  p compact
## 3         audi    a4   2.0 2008   4 manual(m6)   f  20  31  p compact
## 4         audi    a4   2.0 2008   4   auto(av)   f  21  30  p compact
## 5         audi    a4   2.8 1999   6   auto(l5)   f  16  26  p compact
# Last five rows.
tail(mpg, n = 5)
##     manufacturer  model displ year cyl      trans drv cty hwy fl   class
## 230   volkswagen passat   2.0 2008   4   auto(s6)   f  19  28  p midsize
## 231   volkswagen passat   2.0 2008   4 manual(m6)   f  21  29  p midsize
## 232   volkswagen passat   2.8 1999   6   auto(l5)   f  16  26  p midsize
## 233   volkswagen passat   2.8 1999   6 manual(m5)   f  18  26  p midsize
## 234   volkswagen passat   3.6 2008   6   auto(s6)   f  17  26  p midsize
# Show the structure of mpg: each column's type and its first few values.
str(mpg)
## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
# Print summary statistics for every column of mpg.
summary(mpg)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00

변수명 바꾸기

# Install dplyr only when it is missing, so re-running the script does not
# download the package again every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Rename on a copy so the original mpg keeps its column names
# (rename() takes new_name = old_name).
mpg.new <- mpg
mpg.new <- rename(mpg.new, city = cty, highway = hwy)
head(mpg.new, 5)
##   manufacturer model displ year cyl      trans drv city highway fl   class
## 1         audi    a4   1.8 1999   4   auto(l5)   f   18      29  p compact
## 2         audi    a4   1.8 1999   4 manual(m5)   f   21      29  p compact
## 3         audi    a4   2.0 2008   4 manual(m6)   f   20      31  p compact
## 4         audi    a4   2.0 2008   4   auto(av)   f   21      30  p compact
## 5         audi    a4   2.8 1999   6   auto(l5)   f   16      26  p compact

파생변수 만들기 및 활용

# Combined fuel economy: the average of highway and city mileage.
mpg.new$total <- (mpg.new$highway + mpg.new$city) / 2

# Grade each car on total: "A" (>= 30), "B" (>= 25), "C" (>= 20), else "D".
mpg.new$test2 <- with(
  mpg.new,
  ifelse(total >= 30, "A",
         ifelse(total >= 25, "B",
                ifelse(total >= 20, "C", "D")))
)

# Number of cars per grade.
table(mpg.new$test2)
## 
##   A   B   C   D 
##  10  33  85 106
# Grade counts cross-tabulated by model year.
table(mpg.new$test2, mpg.new$year)
##    
##     1999 2008
##   A    5    5
##   B   14   19
##   C   46   39
##   D   52   54
# Bar chart of the grade counts.
qplot(mpg.new$test2)

Week 7

데이터 가공하기1

setwd("C:\\Users\\user\\Desktop\\R")
exam <- read.csv("csv_exam.csv")

# filter(): extract rows. %>% is the pipe operator (ctrl+shift+M).
exam %>%
  filter(class == 1)
##   id class math english science
## 1  1     1   50      98      50
## 2  2     1   60      97      60
## 3  3     1   45      86      78
## 4  4     1   30      98      58

# select(): extract columns; show the first ten rows.
exam %>%
  select(id, math) %>%
  head(n = 10)
##    id math
## 1   1   50
## 2   2   60
## 3   3   45
## 4   4   30
## 5   5   25
## 6   6   50
## 7   7   80
## 8   8   90
## 9   9   20
## 10 10   50

# arrange(): sort rows by math in ascending order; show the three lowest.
exam %>%
  arrange(math) %>%
  head(n = 3)
##   id class math english science
## 1  9     3   20      98      15
## 2  5     2   25      80      65
## 3  4     1   30      98      58

# mutate(): add a total column summing the three subject scores.
exam %>%
  mutate(total = math + english + science) %>%
  head(n = 5)
##   id class math english science total
## 1  1     1   50      98      50   198
## 2  2     1   60      97      60   217
## 3  3     1   45      86      78   209
## 4  4     1   30      98      58   186
## 5  5     2   25      80      65   170

Week 9

데이터 가공하기2

# Midterm and final scores for the same five students, keyed by id.
test1 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  midterm = c(60, 80, 70, 90, 85)
)
test2 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  final = c(70, 83, 65, 95, 80)
)

# Add each student's final score next to the midterm score by matching id.
total <- test1 %>% left_join(test2, by = "id")
print(total)
##   id midterm final
## 1  1      60    70
## 2  2      80    83
## 3  3      70    65
## 4  4      90    95
## 5  5      85    80
# Two groups of students that share the same columns.
group_a <- data.frame(
  id = c(1, 2, 3, 4, 5),
  test = c(60, 80, 70, 90, 85)
)
group_b <- data.frame(
  id = c(6, 7, 8, 9, 10),
  test = c(70, 83, 65, 95, 80)
)

# Stack the rows of both groups into one data frame.
group_all <- bind_rows(list(group_a, group_b))
print(group_all)
##    id test
## 1   1   60
## 2   2   80
## 3   3   70
## 4   4   90
## 5   5   85
## 6   6   70
## 7   7   83
## 8   8   65
## 9   9   95
## 10 10   80

결측치 정제하기

# Small example data frame with one missing value in each column.
df <- data.frame(
  sex = c("M", "F", NA, "M", "F"),
  score = c(5, 4, 3, 4, NA)
)
print(df)
##    sex score
## 1    M     5
## 2    F     4
## 3 <NA>     3
## 4    M     4
## 5    F    NA
# Cell-by-cell missingness of df.
is.na(df)
##        sex score
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,]  TRUE FALSE
## [4,] FALSE FALSE
## [5,] FALSE  TRUE
# Count of missing vs. present cells over the whole data frame.
table(is.na(df))
## 
## FALSE  TRUE 
##     8     2
# The same count for the sex column only.
table(is.na(df[["sex"]]))
## 
## FALSE  TRUE 
##     4     1
# The same count for the score column only.
table(is.na(df[["score"]]))
## 
## FALSE  TRUE 
##     4     1
# Keep only the rows where both score and sex are present.
df_nomiss <- filter(df, !is.na(score), !is.na(sex))
print(df_nomiss)
##   sex score
## 1   M     5
## 2   F     4
## 3   M     4
# na.omit() drops every row that contains a missing value in any column.
df_nomiss2 <- na.omit(df)
print(df_nomiss2)
##   sex score
## 1   M     5
## 2   F     4
## 4   M     4
# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the read below uses a relative name.
setwd("C:\\Users\\user\\Desktop\\R")
exam <- read.csv("csv_exam.csv")
# Blank out three math scores to create missing values for the exercise.
exam[c(3, 8, 15), "math"] <- NA
# Mean of the remaining math scores (TRUE spelled out: T can be reassigned).
exam %>% summarise(mean_math = mean(math, na.rm = TRUE))
##   mean_math
## 1  55.23529
# Replace the missing scores with 55, the mean printed above rounded down.
exam$math[is.na(exam$math)] <- 55
table(is.na(exam$math))
## 
## FALSE 
##    20

μ΄μƒμΉ˜ μ •μ œν•˜κΈ°

# Five-number box-plot statistics for hwy; rows 1 and 5 are the whisker ends.
boxplot(mpg$hwy)$stats

##      [,1]
## [1,]   12
## [2,]   18
## [3,]   24
## [4,]   27
## [5,]   37
## attr(,"class")
##         1 
## "integer"
# Treat values outside the whiskers (12 to 37, from the output above) as
# missing.
mpg$hwy[mpg$hwy < 12 | mpg$hwy > 37] <- NA
table(is.na(mpg$hwy))
## 
## FALSE  TRUE 
##   231     3
# Mean hwy per drive type, ignoring the values just set to NA
# (TRUE spelled out: T can be reassigned).
mpg %>% group_by(drv) %>% summarise(mean_hwy = mean(hwy, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
##   drv   mean_hwy
##   <chr>    <dbl>
## 1 4         19.2
## 2 f         27.7
## 3 r         21

그래프 만들기

# Scatter plot of hwy against displ.
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()
## Warning: Removed 3 rows containing missing values (geom_point).

# The same scatter plot restricted to displ 3-6 and hwy 10-30.
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  xlim(3, 6) +
  ylim(10, 30)
## Warning: Removed 105 rows containing missing values (geom_point).

# Mean hwy per drive type, ignoring missing values
# (TRUE spelled out: T can be reassigned).
df_mpg <- mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
df_mpg
## # A tibble: 3 x 2
##   drv   mean_hwy
##   <chr>    <dbl>
## 1 4         19.2
## 2 f         27.7
## 3 r         21
# Bar chart of the summarised means.
ggplot(data = df_mpg, aes(x = drv, y = mean_hwy)) + geom_col()

# The same chart with bars ordered from highest to lowest mean.
ggplot(data = df_mpg, aes(x = reorder(drv, -mean_hwy), y = mean_hwy)) +
  geom_col()

# geom_bar() counts rows itself, so it takes the raw data.
ggplot(data = mpg, aes(x = drv)) + geom_bar()

ggplot(data = mpg, aes(x = hwy)) + geom_bar()
## Warning: Removed 3 rows containing non-finite values (stat_count).

# Line chart of unemploy over date from the economics dataset.
ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()

Week 11

한국 복지 패널 데이터 분석 준비하기

# Install foreign only when it is missing, so re-running the script does not
# download the package again every time.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
library(dplyr)
library(ggplot2)
library(readxl)
# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the read below uses a relative name.
setwd("C:\\Users\\user\\Desktop\\R")
# Load the SPSS survey file as a data frame
# (TRUE spelled out: T can be reassigned).
raw_welfare <- read.spss(
  file = "Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)
## Warning in read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = T):
## Koweps_hpc10_2015_beta1.sav: Compression bias (0) is not the usual value of 100
# Keep the raw import untouched and work on a copy.
welfare <- raw_welfare

dim(welfare)
## [1] 16664   957
# Give the survey columns used in this analysis readable names.
welfare <- welfare %>%
  rename(
    sex = h10_g3,            # sex
    birth = h10_g4,          # year of birth
    marriage = h10_g10,      # marital status
    religion = h10_g11,      # religion
    income = p1002_8aq1,     # monthly wage
    code_job = h10_eco9,     # job code
    code_region = h10_reg7   # region code
  )

성별에 따른 월급 차이

# Inspect the sex variable.
class(welfare$sex)
## [1] "numeric"
table(welfare$sex)
## 
##    1    2 
## 7578 9086
# Recode 1 as "male" and any other code as "female". Written with `<-`
# rather than right-assignment so the target is visible at the line start.
welfare$sex <- ifelse(welfare$sex == 1, "male", "female")
qplot(welfare$sex)

# Inspect the monthly wage variable.
class(welfare$income)
## [1] "numeric"
summary(welfare$income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   122.0   192.5   241.6   316.6  2400.0   12030
qplot(welfare$income) + xlim(0, 1000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12051 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# Treat wages recorded as 0 or 9999 as missing.
welfare$income <- ifelse(welfare$income %in% c(0, 9999), NA, welfare$income)
table(is.na(welfare$income))
## 
## FALSE  TRUE 
##  4620 12044
# Mean wage by sex, using only respondents with a recorded wage.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))
## `summarise()` ungrouping output (override with `.groups` argument)
sex_income
## # A tibble: 2 x 2
##   sex    mean_income
##   <chr>        <dbl>
## 1 female        163.
## 2 male          312.
ggplot(data = sex_income, aes(x = sex, y = mean_income)) + geom_col()

λ‚˜μ΄ 및 μ—°λ ΉλŒ€μ— λ”°λ₯Έ μ›”κΈ‰ 차이

#λ‚˜μ΄ λ³€μˆ˜ κ²€ν† 
class(welfare$birth) 
## [1] "numeric"
summary(welfare$birth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1907    1946    1966    1968    1988    2014
qplot(welfare$birth)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

welfare$age <- 2015 - welfare$birth + 1 #λ‚˜μ΄ νŒŒμƒλ³€μˆ˜ λ§Œλ“€κΈ°
summary(welfare$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   28.00   50.00   48.43   70.00  109.00
qplot(welfare$age)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#λ‚˜μ΄μ— λ”°λ₯Έ μ›”κΈ‰ 차이 λΆ„μ„ν•˜κΈ° 
welfare %>% filter(!is.na(income)) %>% group_by(age) %>% summarise(mean_income = mean(income)) -> age_income
## `summarise()` ungrouping output (override with `.groups` argument)
age_income
## # A tibble: 69 x 2
##      age mean_income
##    <dbl>       <dbl>
##  1    20        121.
##  2    21        106.
##  3    22        130.
##  4    23        142.
##  5    24        134.
##  6    25        145.
##  7    26        158.
##  8    27        188.
##  9    28        205.
## 10    29        189.
## # ... with 59 more rows
ggplot(data = age_income, aes(x = age, y = mean_income)) + geom_line()

#μ—°λ ΉλŒ€μ— λ”°λ₯Έ μ›”κΈ‰ 차이 λΆ„μ„ν•˜κΈ° 
welfare$ageg <- ifelse(welfare$age < 30, "young", 
                       ifelse(welfare$age <= 59, "middle", "old")) #μ—°λ ΉλŒ€ νŒŒμƒλ³€μˆ˜ λ§Œλ“€κΈ°
table(welfare$ageg)
## 
## middle    old  young 
##   6049   6281   4334
qplot(welfare$ageg)

welfare %>% filter(!is.na(income)) %>% group_by(ageg) %>% summarise(mean_income = mean(income)) -> ageg_income
## `summarise()` ungrouping output (override with `.groups` argument)
ageg_income
## # A tibble: 3 x 2
##   ageg   mean_income
##   <chr>        <dbl>
## 1 middle        282.
## 2 old           125.
## 3 young         164.
ggplot(data = ageg_income, aes(x = ageg, y = mean_income)) + geom_col() + scale_x_discrete(limit = c("young", "middle", "old"))

성별 및 연령대, 나이에 따른 월급 차이

# Mean wage by age group and sex. Note that this overwrites any existing
# sex_income object with the two-way table.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))
## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)
sex_income
## # A tibble: 6 x 3
## # Groups:   ageg [3]
##   ageg   sex    mean_income
##   <chr>  <chr>        <dbl>
## 1 middle female       188. 
## 2 middle male         353. 
## 3 old    female        81.5
## 4 old    male         174. 
## 5 young  female       160. 
## 6 young  male         171.
# Side-by-side bars per sex, with age groups ordered young -> middle -> old
# (`limits` spelled in full instead of the partial `limit`).
ggplot(data = sex_income, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Mean wage by exact age and sex.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
# Print the table that was just built. The earlier version printed
# sex_income here by mistake, so its recorded output was the age-group table
# again; re-run to capture the real sex_age output.
sex_age
ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) + geom_line()

직업별 월급 차이

# Inspect the job-code variable: its type, then how often each code occurs.
class(welfare[["code_job"]])
## [1] "numeric"
table(welfare[["code_job"]])
## 
##  111  120  131  132  133  134  135  139  141  149  151  152  153  159  211  212 
##    2   16   10   11    9    3    7   10   35   20   26   18   15   16    8    4 
##  213  221  222  223  224  231  232  233  234  235  236  237  239  241  242  243 
##    3   17   31   12    4   41    5    3    6   48   14    2   29   12    4   63 
##  244  245  246  247  248  251  252  253  254  259  261  271  272  273  274  281 
##    4   33   59   77   38   14  111   24   67  109    4   15   11    4   36   17 
##  283  284  285  286  289  311  312  313  314  320  330  391  392  399  411  412 
##    8   10   26   16    5  140  260  220   84   75   15    4   13   87   47   12 
##  421  422  423  429  431  432  441  442  510  521  522  530  611  612  613  620 
##  124   71    5   14   20   33  154  197  192  353    5  106 1320   11   40    2 
##  630  710  721  722  730  741  742  743  751  752  753  761  762  771  772  773 
##   20   29   30   22   16   27    3   34   34    5   49   69   27   11   61   86 
##  774  780  791  792  799  811  812  819  821  822  823  831  832  841  842  843 
##    7   17    5   21   45   16    1    6    9    9   23    5   17   32   10    4 
##  851  852  853  854  855  861  862  863  864  871  873  874  875  876  881  882 
##   19   13    7   33    9    3   14   17   31    2  257   34   37    2    2    3 
##  891  892  899  910  921  922  930  941  942  951  952  953  991  992  999 1011 
##    8   19   16  102   31   74  289  325   99  125  122   73   45   12  141    2 
## 1012 
##   17
library(readxl)

# Data preprocessing: load the job-code lookup table from the second sheet
# of the codebook workbook.
# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the read below uses a relative name.
setwd("C:\\Users\\user\\Desktop\\R")
# TRUE spelled out: T can be reassigned.
list_job <- read_excel("Koweps_Codebook.xlsx", col_names = TRUE, sheet = 2)
head(list_job)
## # A tibble: 6 x 2
##   code_job job                                
##      <dbl> <chr>                              
## 1      111 μ˜νšŒμ˜μ› κ³ μœ„κ³΅λ¬΄μ› 및 κ³΅κ³΅λ‹¨μ²΄μž„μ›
## 2      112 κΈ°μ—…κ³ μœ„μž„μ›                       
## 3      120 ν–‰μ • 및 κ²½μ˜μ§€μ› κ΄€λ¦¬μž            
## 4      131 연ꡬ ꡐ윑 및 법λ₯  κ΄€λ ¨ κ΄€λ¦¬μž      
## 5      132 λ³΄ν—˜ 및 금육 κ΄€λ¦¬μž                
## 6      133 보건 및 μ‚¬νšŒλ³΅μ§€ κ΄€λ ¨ κ΄€λ¦¬μž
dim(list_job)
## [1] 149   2
# Attach the job title to each respondent by job code. The join key must be
# passed as `by`; the earlier `id = "code_job"` is not a left_join() argument,
# so the key was only being guessed (hence the old "Joining, by" message).
welfare <- left_join(welfare, list_job, by = "code_job")
# Check the result on the first ten respondents that have a job code.
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(10)
##    code_job                                job
## 1       942                   경비원 및 κ²€ν‘œμ›
## 2       762                             전기곡
## 3       530 λ°©λ¬Έ 노점 및 톡신 판맀 κ΄€λ ¨ μ’…μ‚¬μž
## 4       999        기타 μ„œλΉ„μŠ€κ΄€λ ¨ λ‹¨μˆœ 쒅사원
## 5       312                    κ²½μ˜κ΄€λ ¨ 사무원
## 6       254             문리 기술 및 예λŠ₯ 강사
## 7       510                        μ˜μ—… μ’…μ‚¬μž
## 8       530 λ°©λ¬Έ 노점 및 톡신 판맀 κ΄€λ ¨ μ’…μ‚¬μž
## 9       286   슀포츠 및 λ ˆν¬λ ˆμ΄μ…˜ κ΄€λ ¨ μ „λ¬Έκ°€
## 10      521                   λ§€μž₯ 판맀 μ’…μ‚¬μž
# Mean wage per job, sorted from highest to lowest, for respondents with both
# a job title and a recorded wage; then the ten best- and worst-paid jobs.
job_income <- welfare %>%
  filter(!is.na(job) & !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income)) %>%
  arrange(desc(mean_income))
## `summarise()` ungrouping output (override with `.groups` argument)
top10 <- head(job_income, 10)
bottom10 <- tail(job_income, 10)
# Horizontal bar chart of the ten best-paid jobs.
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

# Horizontal bar chart of the ten worst-paid jobs, with the wage axis fixed
# to 0-850.
ggplot(data = bottom10, aes(x = reorder(job, -mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  ylim(0, 850)

성별 직업 빈도

# Number of male respondents per job, most common first; keep the top ten.
male_job <- welfare %>%
  filter(!is.na(job) & sex == "male") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
male_top10 <- head(male_job, 10)

# The same count for female respondents.
female_job <- welfare %>%
  filter(!is.na(job) & sex == "female") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
female_top10 <- head(female_job, 10)

# Horizontal bar charts of the ten most common jobs for each sex.
ggplot(data = male_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

ggplot(data = female_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

종교 유무에 따른 이혼율

# Inspect the religion and marital-status variables.
class(welfare[["religion"]])
## [1] "numeric"
table(welfare[["religion"]])
## 
##    1    2 
## 8047 8617
class(welfare[["marriage"]])
## [1] "numeric"
table(welfare[["marriage"]])
## 
##    0    1    2    3    4    5    6 
## 2861 8431 2117  712   84 2433   26
# Data preprocessing: recode religion 1 as "yes" and any other code as "no".
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")
qplot(welfare$religion)

# Marital-status group: code 1 is "marriage", code 3 is "divorce", and every
# other code becomes NA.
welfare$group_marriage <- with(
  welfare,
  ifelse(marriage == 1, "marriage",
         ifelse(marriage == 3, "divorce", NA))
)
table(welfare$group_marriage)
## 
##  divorce marriage 
##      712     8431
qplot(welfare$group_marriage)

# Share of married vs. divorced respondents within each religion group
# (pct is a percentage rounded to one decimal place).
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )
## `summarise()` regrouping output by 'religion' (override with `.groups` argument)
divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)

# Plot the divorce rate by religion.
ggplot(data = divorce, aes(x = religion, y = pct)) + geom_col()

# Divorce rate by age group.
ageg_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )
## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)
ageg_divorce <- ageg_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, pct)
ggplot(data = ageg_divorce, aes(x = ageg, y = pct)) + geom_col()

# Divorce rate by age group and religion.
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )
## `summarise()` regrouping output by 'ageg', 'religion' (override with `.groups` argument)
ageg_religion_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)
ggplot(data = ageg_religion_divorce, aes(x = ageg, y = pct, fill = religion)) +
  geom_col(position = "dodge")

지역별 연령대 비율

# Inspect the region-code variable.
class(welfare$code_region)
## [1] "numeric"
table(welfare$code_region)
## 
##    1    2    3    4    5    6    7 
## 2486 3711 2785 2036 1467 1257 2922
# Data preprocessing: lookup table from region code to region name. The
# Korean names were stored as mis-decoded UTF-8 (mojibake) and are restored.
list_region <- data.frame(
  code_region = c(1:7),
  region = c(
    "서울", "수도권(인천/경기)", "부산/경남/울산", "대구/경북",
    "대전/충남", "강원/충북", "광주/전남/전북/제주도"
  )
)
welfare <- left_join(welfare, list_region, by = "code_region")

# Percentage of each age group within every region (two decimal places).
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 2))
## `summarise()` regrouping output by 'region' (override with `.groups` argument)
# Stacked horizontal bars: one bar per region, split by age group.
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip()

# Order the regions by their share of "old" respondents. The regions are
# sorted ascending by pct and used as the axis limits.
# NOTE: `order` is a data object here that shares its name with base::order().
list_order_old <- region_ageg %>% filter(ageg == "old") %>% arrange(pct)
order <- list_order_old$region
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

# Fix the stacking/colour order of the age groups by making ageg a factor.
# `levels` is spelled in full instead of relying on partial matching.
region_ageg$ageg <- factor(region_ageg$ageg,
                           levels = c("old", "middle", "young"))
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

Week 12

# Install the text-mining helpers only when they are missing, so re-running
# the script does not download them again every time.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
# NOTE(review): KoNLP is loaded but never installed by this script - confirm
# how it is expected to be provided.
library(KoNLP)
## Checking user defined dictionary!
library(stringr)
library(dplyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

###힙합 가사 텍스트 마이닝

# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the read below uses a relative name.
setwd("C:\\Users\\user\\Desktop\\R")
txt <- readLines("hiphop.txt")
head(txt)
## [1] "\"보고 싶다"                  "이렇게 말하니까 더 보고 싶다"
## [3] "너희 사진을 보고 있어도"      "보고 싶다"
## [5] "너무 야속한 시간"             "나는 우리가 밉다"
# Replace every non-word character with a space
# (gsub("\\W", " ", txt) would probably work as well).
txt <- str_replace_all(txt, "\\W", " ")
# Quick check of extractNoun() on a sample sentence. The Korean literal was
# stored as mis-decoded UTF-8 (mojibake) and is restored here.
extractNoun("대한민국의 영토는 한반도와 그 부속도서로 한다")
## [1] "대한"     "민국"     "영토"     "한반도와" "부속도서" "한"
# Find the most frequently used words.
# Extract nouns from the lyrics; the result is similar to strsplit() output,
# but seems to keep only nouns.
nouns <- extractNoun(txt)
wordcount <- table(unlist(nouns))
# Frequency table as a data frame with character (not factor) words;
# FALSE spelled out because F can be reassigned.
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
df_word <- rename(df_word, word = Var1, freq = Freq)
# Keep only words of at least two characters.
df_word <- filter(df_word, nchar(word) >= 2)
top_20 <- df_word %>% arrange(desc(freq)) %>% head(20)

# Draw the word cloud.
pal <- brewer.pal(8, "Dark2")
# Fix the random seed so the layout is reproducible.
set.seed(1234)
wordcloud(words = df_word$word,
          freq = df_word$freq,
          min.freq = 2,
          max.words = 200,
          random.order = FALSE,  # FALSE spelled out: F can be reassigned
          rot.per = .1,
          scale = c(4, 0.3),
          colors = pal)

###국정원 트윗 텍스트 마이닝

# Clean the data.
# NOTE(review): setwd() with an absolute, machine-specific path ties the
# script to one computer; kept because the read below uses a relative name.
setwd("C:\\Users\\user\\Desktop\\R")
# TRUE/FALSE spelled out: T and F can be reassigned.
twitter <- read.csv("twitter.csv", header = TRUE, stringsAsFactors = FALSE,
                    fileEncoding = "UTF-8")
# Rename the Korean column headers to short English names. The Korean
# identifiers were stored as mis-decoded UTF-8 (mojibake) and are restored so
# they match the headers in the CSV file.
twitter <- rename(twitter, no = 번호, id = 계정이름, date = 작성일, tw = 내용)
# Replace every non-word character in the tweet text with a space.
twitter$tw <- str_replace_all(twitter$tw, "\\W", " ")

# Find the most frequently used words.
nouns <- extractNoun(twitter$tw)
wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
df_word <- rename(df_word, word = Var1, freq = Freq)
# Keep only words of at least two characters.
df_word <- filter(df_word, nchar(word) >= 2)
top_20 <- df_word %>% arrange(desc(freq)) %>% head(20)

# Word-frequency bar chart: words are ordered by ascending frequency and each
# bar is labelled with its count.
# NOTE: `order` is a data object here that shares its name with base::order().
order <- arrange(top_20, freq)$word
# `limits` is spelled in full instead of relying on partial matching.
ggplot(data = top_20, aes(x = word, y = freq)) +
  ylim(0, 2500) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order) +
  geom_text(aes(label = freq), hjust = -0.3)

# Word cloud of the tweet nouns using the "Dark2" palette. The seed is fixed
# so the layout is reproducible.
pal <- brewer.pal(8, "Dark2")
set.seed(1234)
wordcloud(words = df_word$word,
          freq = df_word$freq,
          min.freq = 10,
          max.words = 200,
          random.order = FALSE,  # FALSE spelled out: F can be reassigned
          rot.per = .1,
          scale = c(6, 0.2),
          colors = pal)

# The same cloud drawn with colours 5 to 9 of the "Blues" palette.
pal <- brewer.pal(9, "Blues")[5:9]
set.seed(1234)
wordcloud(words = df_word$word,
          freq = df_word$freq,
          min.freq = 10,
          max.words = 200,
          random.order = FALSE,
          rot.per = .1,
          scale = c(6, 0.2),
          colors = pal)