# Scalar assignments.
a <- 1
b <- 2
c <- 3 # a variable named `c`; the c(...) calls below still resolve to the function
d <- 3.5

# Numeric vectors: explicit values, an integer range, and stepped sequences.
var1 <- c(1, 2, 5, 7, 8)
var2 <- 1:5
var3 <- seq(1, 10, by = 2)
var4 <- seq(10, 100, by = 7)

# Character vectors. The fruit names had been stored as mis-decoded UTF-8 and
# are restored here as Korean text.
# NOTE(review): the fourth name was rebuilt from a lossy encoding - confirm.
class <- c("사과", "배", "오렌지", "자두", "귤", "바나나")
str4 <- c("a", "b", "c")
str5 <- c("Hello", "world", "is", "good!")

# Join the words into one string separated by single spaces.
paste(str5, collapse = " ")
## [1] "Hello world is good!"
# Install ggplot2 only when it is missing, so re-running the script does not
# download the package again every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Quick plot of a character vector.
x <- c("a", "a", "b", "c")
qplot(x)

# Distribution of hwy (highway fuel economy) in the mpg data set.
qplot(data = mpg, x = hwy)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plot of hwy for each drive type, coloured by drive type.
qplot(data = mpg, x = drv, y = hwy, geom = "boxplot", col = drv)

# mtcars: wt against mpg, then mapped to colour, to size, and split into facets.
qplot(mpg, wt, data = mtcars)
qplot(mpg, wt, data = mtcars, colour = cyl)
qplot(mpg, wt, data = mtcars, size = cyl)
qplot(mpg, wt, data = mtcars, facets = vs ~ am)
## Week 3

# Exam scores and class numbers as separate vectors.
english <- c(90, 80, 70, 60)
math <- c(50, 60, 100, 20)
class <- c(1, 1, 2, 2)

# The same values combined into one data frame (built from the vectors above
# instead of repeating the literals).
df_midterm <- data.frame(
  english = english,
  math = math,
  class = class
)
mean(df_midterm$english)
## [1] 75
mean(df_midterm$math)
## [1] 57.5

# Fruit sales table. The product names had been stored as mis-decoded UTF-8
# and are restored here as Korean text.
df_fruit <- data.frame(
  goods = c("사과", "딸기", "수박"),
  price = c(1800, 1500, 3000),
  sell = c(24, 38, 13)
)
mean(df_fruit$price)
## [1] 2100
mean(df_fruit$sell)
## [1] 25
# Install readxl only when it is missing.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

# Every file below is read from / written to this directory.
setwd("C:\\Users\\user\\Desktop\\R")

# Excel file read with the default settings.
df_exam <- read_excel("excel_exam.xlsx")
mean(df_exam$math)
## [1] 57.45
mean(df_exam$english)
## [1] 84.9
mean(df_exam$science)
## [1] 59.45

# Case where the first row is not variable names.
df_exam_novar <- read_excel("excel_exam_novar.xlsx", col_names = FALSE)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5

# Case where the workbook has several sheets: read the third one.
df_exam_sheet <- read_excel("excel_exam_sheet.xlsx", sheet = 3)

# CSV file, with stringsAsFactors switched off.
df_csv_exam <- read.csv("csv_exam.csv", stringsAsFactors = FALSE)
str(df_csv_exam)
## 'data.frame': 20 obs. of 5 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ class : int 1 1 1 1 2 2 2 2 3 3 ...
## $ math : int 50 60 45 30 25 50 80 90 20 50 ...
## $ english: int 98 97 86 98 80 89 90 78 98 98 ...
## $ science: int 50 60 78 58 65 98 45 25 15 45 ...

# Save the midterm data frame as CSV and as RDS.
write.csv(df_midterm, file = "df_midterm.csv")
saveRDS(df_midterm, file = "df_midterm.rds")

# Even after the object is removed in R it can be restored from the RDS file.
rm(df_midterm)
readRDS("df_midterm.rds")
## english math class
## 1 90 50 1
## 2 80 60 1
## 3 70 100 2
## 4 60 20 2
# Work on a plain data.frame copy of the mpg data set from ggplot2.
mpg <- as.data.frame(ggplot2::mpg)

# Number of rows and columns.
dim(mpg)
## [1] 234 11

# First five rows.
head(mpg, n = 5)
## manufacturer model displ year cyl trans drv cty hwy fl class
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact

# Last five rows.
tail(mpg, n = 5)
## manufacturer model displ year cyl trans drv cty hwy fl class
## 230 volkswagen passat 2.0 2008 4 auto(s6) f 19 28 p midsize
## 231 volkswagen passat 2.0 2008 4 manual(m6) f 21 29 p midsize
## 232 volkswagen passat 2.8 1999 6 auto(l5) f 16 26 p midsize
## 233 volkswagen passat 2.8 1999 6 manual(m5) f 18 26 p midsize
## 234 volkswagen passat 3.6 2008 6 auto(s6) f 17 26 p midsize

# Column types.
str(object = mpg)
## 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...

# Per-column summary statistics.
summary(object = mpg)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
# Install dplyr only when it is missing.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

# Copy mpg and rename cty / hwy to city / highway.
mpg.new <- mpg
mpg.new <- rename(mpg.new, city = cty, highway = hwy)
head(mpg.new, 5)
## manufacturer model displ year cyl trans drv city highway fl class
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact

# Derived column: mean of the highway and city values.
mpg.new$total <- (mpg.new$highway + mpg.new$city) / 2

# Grade by total: "A" for >= 30, "B" for >= 25, "C" for >= 20, otherwise "D".
mpg.new$test2 <- ifelse(
  mpg.new$total >= 30, "A",
  ifelse(
    mpg.new$total >= 25, "B",
    ifelse(mpg.new$total >= 20, "C", "D")
  )
)

# Grade counts, overall and by model year.
table(mpg.new$test2)
##
## A B C D
## 10 33 85 106
table(mpg.new$test2, mpg.new$year)
##
## 1999 2008
## A 5 5
## B 14 19
## C 46 39
## D 52 54
qplot(mpg.new$test2)
setwd("C:\\Users\\user\\Desktop\\R")
exam <- read.csv("csv_exam.csv")

# filter() extracts rows; %>% is the pipe (author's shortcut note: ctrl+shift+M).
exam %>%
  filter(class == 1)
## id class math english science
## 1 1 1 50 98 50
## 2 2 1 60 97 60
## 3 3 1 45 86 78
## 4 4 1 30 98 58

# select() extracts columns.
exam %>%
  select(id, math) %>%
  head(n = 10)
## id math
## 1 1 50
## 2 2 60
## 3 3 45
## 4 4 30
## 5 5 25
## 6 6 50
## 7 7 80
## 8 8 90
## 9 9 20
## 10 10 50

# arrange() sorts rows; here the three lowest math scores.
exam %>%
  arrange(math) %>%
  head(n = 3)
## id class math english science
## 1 9 3 20 98 15
## 2 5 2 25 80 65
## 3 4 1 30 98 58

# mutate() adds a column; here the sum of the three subject scores.
exam %>%
  mutate(total = math + english + science) %>%
  head(n = 5)
## id class math english science total
## 1 1 1 50 98 50 198
## 2 2 1 60 97 60 217
## 3 3 1 45 86 78 209
## 4 4 1 30 98 58 186
## 5 5 2 25 80 65 170
# Two score tables that share the key column `id`.
test1 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  midterm = c(60, 80, 70, 90, 85)
)
test2 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  final = c(70, 83, 65, 95, 80)
)

# Combine side by side, matching rows on `id`.
total <- test1 %>%
  left_join(test2, by = "id")
total
## id midterm final
## 1 1 60 70
## 2 2 80 83
## 3 3 70 65
## 4 4 90 95
## 5 5 85 80

# Two tables with the same columns but different ids.
group_a <- data.frame(
  id = c(1, 2, 3, 4, 5),
  test = c(60, 80, 70, 90, 85)
)
group_b <- data.frame(
  id = c(6, 7, 8, 9, 10),
  test = c(70, 83, 65, 95, 80)
)

# Stack them on top of each other.
group_all <- group_a %>%
  bind_rows(group_b)
group_all
## id test
## 1 1 60
## 2 2 80
## 3 3 70
## 4 4 90
## 5 5 85
## 6 6 70
## 7 7 83
## 8 8 65
## 9 9 95
## 10 10 80
# Small data frame with one missing value in each column.
df <- data.frame(
  sex = c("M", "F", NA, "M", "F"),
  score = c(5, 4, 3, 4, NA)
)
df
## sex score
## 1 M 5
## 2 F 4
## 3 <NA> 3
## 4 M 4
## 5 F NA

# Locate the missing values cell by cell.
is.na(df)
## sex score
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] TRUE FALSE
## [4,] FALSE FALSE
## [5,] FALSE TRUE

# Count missing values: whole table, then each column.
table(is.na(df))
##
## FALSE TRUE
## 8 2
table(is.na(df$sex))
##
## FALSE TRUE
## 4 1
table(is.na(df$score))
##
## FALSE TRUE
## 4 1

# Keep only rows where both score and sex are present (dplyr).
df_nomiss <- filter(df, !is.na(score), !is.na(sex))
df_nomiss
## sex score
## 1 M 5
## 2 F 4
## 3 M 4

# Same rows with base R; note the original row names are kept.
df_nomiss2 <- na.omit(df)
df_nomiss2
## sex score
## 1 M 5
## 2 F 4
## 4 M 4
setwd("C:\\Users\\user\\Desktop\\R")
exam <- read.csv("csv_exam.csv")

# Blank out three math scores to create missing values.
exam[c(3, 8, 15), "math"] <- NA

# Mean of the remaining math scores, ignoring NA.
exam %>%
  summarise(mean_math = mean(math, na.rm = TRUE))
## mean_math
## 1 55.23529

# Replace the missing scores with 55 (close to the mean printed above).
exam$math <- ifelse(is.na(exam$math), 55, exam$math)
table(is.na(exam$math))
##
## FALSE
## 20
# Draw a box plot of hwy and print its five summary statistics.
boxplot(mpg$hwy)$stats
## [,1]
## [1,] 12
## [2,] 18
## [3,] 24
## [4,] 27
## [5,] 37
## attr(,"class")
## 1
## "integer"

# Values outside 12..37 (first and last statistic above) are set to NA.
mpg$hwy <- ifelse(mpg$hwy < 12 | mpg$hwy > 37, NA, mpg$hwy)
table(is.na(mpg$hwy))
##
## FALSE TRUE
## 231 3

# Mean hwy per drive type, ignoring the values blanked out above.
mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## drv mean_hwy
## <chr> <dbl>
## 1 4 19.2
## 2 f 27.7
## 3 r 21
# Scatter plot of hwy against displ.
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point()
## Warning: Removed 3 rows containing missing values (geom_point).

# Same plot restricted to displ 3..6 and hwy 10..30.
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  xlim(3, 6) +
  ylim(10, 30)
## Warning: Removed 105 rows containing missing values (geom_point).

# Mean hwy per drive type, ignoring NA.
df_mpg <- mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
df_mpg
## # A tibble: 3 x 2
## drv mean_hwy
## <chr> <dbl>
## 1 4 19.2
## 2 f 27.7
## 3 r 21

# Bar chart of the means, then the same chart with bars sorted by descending mean.
ggplot(data = df_mpg, aes(x = drv, y = mean_hwy)) +
  geom_col()
ggplot(data = df_mpg, aes(x = reorder(drv, -mean_hwy), y = mean_hwy)) +
  geom_col()

# Frequency bar charts computed from the raw rows.
ggplot(data = mpg, aes(x = drv)) +
  geom_bar()
ggplot(data = mpg, aes(x = hwy)) +
  geom_bar()
## Warning: Removed 3 rows containing non-finite values (stat_count).

# Line chart of unemploy over date from the economics data set.
ggplot(data = economics, aes(x = date, y = unemploy)) +
  geom_line()
# Install foreign only when it is missing.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
library(dplyr)
library(ggplot2)
library(readxl)

# Load the SPSS survey file as a data frame and keep the raw copy untouched.
setwd("C:\\Users\\user\\Desktop\\R")
raw_welfare <- read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = TRUE)
## Warning in read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = T):
## Koweps_hpc10_2015_beta1.sav: Compression bias (0) is not the usual value of 100
welfare <- raw_welfare
dim(welfare)
## [1] 16664 957

# Give the analysed columns readable names. Each comment is kept on a single
# line: a comment that wraps onto the next line becomes stray code inside the call.
welfare <- rename(
  welfare,
  sex = h10_g3, # sex
  birth = h10_g4, # year of birth
  marriage = h10_g10, # marital status
  religion = h10_g11, # religion
  income = p1002_8aq1, # wage
  code_job = h10_eco9, # job code
  code_region = h10_reg7 # region code
)

# Review the sex variable.
class(welfare$sex)
## [1] "numeric"
table(welfare$sex)
##
## 1 2
## 7578 9086

# Recode the numeric codes as labels: 1 becomes "male", any other value "female".
welfare$sex <- ifelse(welfare$sex == 1, "male", "female")
qplot(welfare$sex)

# Review the wage variable.
class(welfare$income)
## [1] "numeric"
summary(welfare$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 122.0 192.5 241.6 316.6 2400.0 12030
qplot(welfare$income) + xlim(0, 1000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12051 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# Handle missing values: wages of 0 and 9999 are turned into NA.
welfare$income <- ifelse(welfare$income %in% c(0, 9999), NA, welfare$income)
table(is.na(welfare$income))
##
## FALSE TRUE
## 4620 12044

# Analyse the wage difference by sex.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))
## `summarise()` ungrouping output (override with `.groups` argument)
sex_income
## # A tibble: 2 x 2
## sex mean_income
## <chr> <dbl>
## 1 female 163.
## 2 male 312.
ggplot(data = sex_income, aes(x = sex, y = mean_income)) +
  geom_col()

# Review the age variable.
class(welfare$birth)
## [1] "numeric"
summary(welfare$birth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1907 1946 1966 1968 1988 2014
qplot(welfare$birth)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Derived age variable: 2015 minus the birth year, plus one.
welfare$age <- 2015 - welfare$birth + 1
summary(welfare$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 28.00 50.00 48.43 70.00 109.00
qplot(welfare$age)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Analyse the wage difference by age.
age_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))
## `summarise()` ungrouping output (override with `.groups` argument)
age_income
## # A tibble: 69 x 2
## age mean_income
## <dbl> <dbl>
## 1 20 121.
## 2 21 106.
## 3 22 130.
## 4 23 142.
## 5 24 134.
## 6 25 145.
## 7 26 158.
## 8 27 188.
## 9 28 205.
## 10 29 189.
## # ... with 59 more rows
ggplot(data = age_income, aes(x = age, y = mean_income)) +
  geom_line()

# Analyse the wage difference by age group.
# Derived age-group variable: age < 30 is "young", age <= 59 is "middle",
# everything else is "old".
welfare$ageg <- ifelse(
  welfare$age < 30, "young",
  ifelse(welfare$age <= 59, "middle", "old")
)
table(welfare$ageg)
##
## middle old young
## 6049 6281 4334
qplot(welfare$ageg)

# Mean wage per age group.
ageg_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))
## `summarise()` ungrouping output (override with `.groups` argument)
ageg_income
## # A tibble: 3 x 2
## ageg mean_income
## <chr> <dbl>
## 1 middle 282.
## 2 old 125.
## 3 young 164.

# `limits` (spelled out in full) fixes the bar order on the x axis.
ggplot(data = ageg_income, aes(x = ageg, y = mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Wage difference by sex and age group.
# Mean wage for each combination of age group and sex.
# NOTE: this reuses the name `sex_income`, replacing the by-sex table built earlier.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))
## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)
sex_income
## # A tibble: 6 x 3
## # Groups: ageg [3]
## ageg sex mean_income
## <chr> <chr> <dbl>
## 1 middle female 188.
## 2 middle male 353.
## 3 old female 81.5
## 4 old male 174.
## 5 young female 160.
## 6 young male 171.
ggplot(data = sex_income, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Wage difference by sex and age.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

# Print the table that was just built. The script used to print `sex_income`
# again here, so no recorded output exists for `sex_age`.
sex_age
ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) +
  geom_line()
# Review the job-code variable: its type, then the count of each code.
class(welfare[["code_job"]])
## [1] "numeric"
table(welfare[["code_job"]])
##
## 111 120 131 132 133 134 135 139 141 149 151 152 153 159 211 212
## 2 16 10 11 9 3 7 10 35 20 26 18 15 16 8 4
## 213 221 222 223 224 231 232 233 234 235 236 237 239 241 242 243
## 3 17 31 12 4 41 5 3 6 48 14 2 29 12 4 63
## 244 245 246 247 248 251 252 253 254 259 261 271 272 273 274 281
## 4 33 59 77 38 14 111 24 67 109 4 15 11 4 36 17
## 283 284 285 286 289 311 312 313 314 320 330 391 392 399 411 412
## 8 10 26 16 5 140 260 220 84 75 15 4 13 87 47 12
## 421 422 423 429 431 432 441 442 510 521 522 530 611 612 613 620
## 124 71 5 14 20 33 154 197 192 353 5 106 1320 11 40 2
## 630 710 721 722 730 741 742 743 751 752 753 761 762 771 772 773
## 20 29 30 22 16 27 3 34 34 5 49 69 27 11 61 86
## 774 780 791 792 799 811 812 819 821 822 823 831 832 841 842 843
## 7 17 5 21 45 16 1 6 9 9 23 5 17 32 10 4
## 851 852 853 854 855 861 862 863 864 871 873 874 875 876 881 882
## 19 13 7 33 9 3 14 17 31 2 257 34 37 2 2 3
## 891 892 899 910 921 922 930 941 942 951 952 953 991 992 999 1011
## 8 19 16 102 31 74 289 325 99 125 122 73 45 12 141 2
## 1012
## 17
library(readxl)

# Data preprocessing: read the job-code lookup table from sheet 2 of the codebook.
setwd("C:\\Users\\user\\Desktop\\R")
list_job <- read_excel("Koweps_Codebook.xlsx", col_names = TRUE, sheet = 2)
head(list_job)
## # A tibble: 6 x 2
## code_job job
## <dbl> <chr>
## 1 111 의회의원 고위공무원 및 공공단체임원
## 2 112 기업고위임원
## 3 120 행정 및 경영지원 관리자
## 4 131 연구 교육 및 법률 관련 관리자
## 5 132 보험 및 금융 관리자
## 6 133 보건 및 사회복지 관련 관리자
dim(list_job)
## [1] 149 2

# Attach the job title to every respondent. The key is passed as `by`;
# `id` is not a left_join() argument.
welfare <- left_join(welfare, list_job, by = "code_job")
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(10)
## code_job job
## 1 942 경비원 및 검표원
## 2 762 전기공
## 3 530 방문 노점 및 통신 판매 관련 종사자
## 4 999 기타 서비스관련 단순 종사원
## 5 312 경영관련 사무원
## 6 254 문리 기술 및 예능 강사
## 7 510 영업 종사자
## 8 530 방문 노점 및 통신 판매 관련 종사자
## 9 286 스포츠 및 레크레이션 관련 전문가
## 10 521 매장 판매 종사자

# Mean wage of the top 10 and bottom 10 jobs.
# Mean wage per job, sorted from highest to lowest.
job_income <- welfare %>%
  filter(!is.na(job) & !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income)) %>%
  arrange(desc(mean_income))
## `summarise()` ungrouping output (override with `.groups` argument)

# First and last ten rows of the sorted table.
top10 <- head(job_income, 10)
bottom10 <- tail(job_income, 10)

# Horizontal bar charts; the second one uses a fixed y range of 0 to 850.
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()
ggplot(data = bottom10, aes(x = reorder(job, -mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  ylim(0, 850)

# Number of respondents per job among men, most frequent first.
male_job <- welfare %>%
  filter(!is.na(job) & sex == "male") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
male_top10 <- head(male_job, 10)

# The same count among women.
female_job <- welfare %>%
  filter(!is.na(job) & sex == "female") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
female_top10 <- head(female_job, 10)

ggplot(data = male_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()
ggplot(data = female_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()
# Review the religion and marriage variables.
class(welfare$religion)
## [1] "numeric"
table(welfare$religion)
##
## 1 2
## 8047 8617
class(welfare$marriage)
## [1] "numeric"
table(welfare$marriage)
##
## 0 1 2 3 4 5 6
## 2861 8431 2117 712 84 2433 26

# Data preprocessing: religion code 1 becomes "yes", any other value "no".
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")
qplot(welfare$religion)

# Marriage code 1 becomes "marriage", code 3 becomes "divorce", the rest NA.
welfare$group_marriage <- ifelse(
  welfare$marriage == 1, "marriage",
  ifelse(welfare$marriage == 3, "divorce", NA)
)
table(welfare$group_marriage)
##
## divorce marriage
## 712 8431
qplot(welfare$group_marriage)

# Share of each marriage group within each religion value, in percent.
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 1))
## `summarise()` regrouping output by 'religion' (override with `.groups` argument)
divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)

# Draw the chart.
ggplot(data = divorce, aes(x = religion, y = pct)) +
  geom_col()

# Divorce rate by age group.
ageg_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 1))
## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)
ageg_divorce <- ageg_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, pct)
ggplot(data = ageg_divorce, aes(x = ageg, y = pct)) +
  geom_col()

# Divorce rate by age group and religion.
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 1))
## `summarise()` regrouping output by 'ageg', 'religion' (override with `.groups` argument)
ageg_religion_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)
ggplot(data = ageg_religion_divorce, aes(x = ageg, y = pct, fill = religion)) +
  geom_col(position = "dodge")
# Review the region-code variable.
class(welfare$code_region)
## [1] "numeric"
table(welfare$code_region)
##
## 1 2 3 4 5 6 7
## 2486 3711 2785 2036 1467 1257 2922

# Data preprocessing: lookup table from region code to region name. The names
# had been stored as mis-decoded UTF-8 and are restored here as Korean text.
list_region <- data.frame(
  code_region = 1:7,
  region = c(
    "서울",
    "수도권(인천/경기)",
    "부산/경남/울산",
    "대구/경북",
    "대전/충남",
    "강원/충북",
    "광주/전남/전북/제주도"
  )
)
welfare <- left_join(welfare, list_region, by = "code_region")

# Share of each age group within each region, in percent.
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 2))
## `summarise()` regrouping output by 'region' (override with `.groups` argument)

# Draw the chart.
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip()

# Order the regions by their share of the "old" group.
list_order_old <- region_ageg %>%
  filter(ageg == "old") %>%
  arrange(pct)
order <- list_order_old$region
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

# Fix the order of the age groups; `levels` is spelled out in full.
region_ageg$ageg <- factor(region_ageg$ageg, levels = c("old", "middle", "young"))
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)
# Install stringr and wordcloud only when they are missing.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(KoNLP)
## Checking user defined dictionary!
library(stringr)
library(dplyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

### Text mining of hip-hop lyrics
setwd("C:\\Users\\user\\Desktop\\R")
txt <- readLines("hiphop.txt")
head(txt)
## [1] "\"보고 싶다" "이렇게 말하니까 더 보고 싶다"
## [3] "너희 사진을 보고 있어도" "보고 싶다"
## [5] "너무 야속한 시간" "나는 우리가 밉다"

# Replace every non-word character with a space.
# Author's note: base R gsub("\\W", " ", txt) could probably be used as well.
txt <- str_replace_all(txt, "\\W", " ")

# Example of noun extraction on a single sentence.
extractNoun("대한민국의 영토는 한반도와 그 부속도서로 한다")
## [1] "대한" "민국" "영토" "한반도와" "부속도서로" "한"

# Find the most frequently used words.
# Author's note: extractNoun() pulls the nouns out of the lyrics; it seems
# similar to strsplit() but keeps only nouns.
nouns <- extractNoun(txt)
wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
df_word <- rename(df_word, word = Var1, freq = Freq)

# Keep words of at least two characters.
df_word <- filter(df_word, nchar(word) >= 2)
top_20 <- df_word %>%
  arrange(desc(freq)) %>%
  head(20)

# Build the word cloud; the seed is set immediately before the call.
pal <- brewer.pal(8, "Dark2")
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = .1,
  scale = c(4, 0.3),
  colors = pal
)

### Text mining of the National Intelligence Service tweets
# Clean the data.
setwd("C:\\Users\\user\\Desktop\\R")
twitter <- read.csv(
  "twitter.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)

# Rename the Korean column headers. The header names had been mis-decoded into
# text that is not a valid R name; they are restored here.
twitter <- rename(twitter, no = 번호, id = 계정이름, date = 작성일, tw = 내용)

# Replace every non-word character in the tweet text with a space.
twitter$tw <- str_replace_all(twitter$tw, "\\W", " ")

# Find the most frequently used words.
nouns <- extractNoun(twitter$tw)
wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
df_word <- rename(df_word, word = Var1, freq = Freq)

# Keep words of at least two characters.
df_word <- filter(df_word, nchar(word) >= 2)
top_20 <- df_word %>%
  arrange(desc(freq)) %>%
  head(20)

# Word-frequency bar chart, with the bars ordered by frequency.
order <- arrange(top_20, freq)$word
ggplot(data = top_20, aes(x = word, y = freq)) +
  ylim(0, 2500) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order) +
  geom_text(aes(label = freq), hjust = -0.3)

# Word cloud with the "Dark2" palette.
pal <- brewer.pal(8, "Dark2")
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = .1,
  scale = c(6, 0.2),
  colors = pal
)

# Same word cloud with colours 5 to 9 of the "Blues" palette.
pal <- brewer.pal(9, "Blues")[5:9]
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = .1,
  scale = c(6, 0.2),
  colors = pal
)