# Three numeric series whose magnitudes differ by orders of magnitude.
pampas <- c(283, 288, 205, 204, 287, 300, 310)
milk <- c(33, 31, 31, 32, 33, 34, 29)
tissue <- c(2500, 2450, 2490, 2750, 2800, 2350, 2450)

# Open an empty plot with fixed axis limits, then overlay every series as
# points joined by lines, one colour per series.
plot(NULL, NULL, xlim = c(1, 10), ylim = c(10, 3000))
series.by.colour <- list(blue = pampas, red = milk, black = tissue)
for (colour in names(series.by.colour)) {
  lines(series.by.colour[[colour]], type = "b", col = colour)
}

# Standardise pampas (centre on its mean, divide by its standard deviation).
scale(pampas)
## [,1]
## [1,] 0.3344691
## [2,] 0.4470309
## [3,] -1.4214938
## [4,] -1.4440061
## [5,] 0.4245185
## [6,] 0.7171790
## [7,] 0.9423024
## attr(,"scaled:center")
## [1] 268.1429
## attr(,"scaled:scale")
## [1] 44.42007
# Standardise the four numeric iris measurements column by column.
# iris[1:4] selects columns 1-4: single-bracket indexing on a data frame
# with one index selects columns.
sc <- scale(iris[1:4])
head(sc)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
## [6,] -0.5353840 1.93331463 -1.165809 -1.048667
# scale() returns a matrix; convert it back to a data frame.
df <- as.data.frame(sc)
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -0.8976739 1.01560199 -1.335752 -1.311052
## 2 -1.1392005 -0.13153881 -1.335752 -1.311052
## 3 -1.3807271 0.32731751 -1.392399 -1.311052
## 4 -1.5014904 0.09788935 -1.279104 -1.311052
## 5 -1.0184372 1.24503015 -1.335752 -1.311052
## 6 -0.5353840 1.93331463 -1.165809 -1.048667
# Re-attach the species labels. The column is named explicitly: an unnamed
# iris$Species argument produces a column literally called "iris$Species",
# which cannot be referenced as cb$Species.
cb <- cbind(df, Species = iris$Species)
head(cb)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 -0.8976739 1.01560199 -1.335752 -1.311052 setosa
## 2 -1.1392005 -0.13153881 -1.335752 -1.311052 setosa
## 3 -1.3807271 0.32731751 -1.392399 -1.311052 setosa
## 4 -1.5014904 0.09788935 -1.279104 -1.311052 setosa
## 5 -1.0184372 1.24503015 -1.335752 -1.311052 setosa
## 6 -0.5353840 1.93331463 -1.165809 -1.048667 setosa
data <- read.csv("agemoney.csv")
# Mean of column 2 (money, per the original note) over the rows whose age
# lies in [from, from + 10). `&` is used because the comparison is
# element-wise over the whole age vector.
decade.mean <- function(from) {
  in.decade <- data$age >= from & data$age < from + 10
  mean(data[in.decade, 2])
}
age10 <- decade.mean(10)
age20 <- decade.mean(20)
age30 <- decade.mean(30)
age40 <- decade.mean(40)
age50 <- decade.mean(50)
# One row per decade: the decade's lower bound and its mean money.
data.mean <- data.frame(
  age = c(10, 20, 30, 40, 50),
  money = c(age10, age20, age30, age40, age50)
)
data.mean
# The dates start out as plain character strings.
date.txt <- c("2016-11-01", "2016-11-02", "2016-11-03", "2016-11-04")
class(date.txt)
## [1] "character"
# as.Date() parses them into Date objects.
date.as.date <- as.Date(date.txt)
date.as.date
## [1] "2016-11-01" "2016-11-02" "2016-11-03" "2016-11-04"
class(date.as.date)
## [1] "Date"
# %A is the full weekday name in the current locale; the "full" variant
# prefixes it with the four-digit year and the month.
date.as.date.week.only <- format(date.as.date, format = "%A")
date.as.date.week.full <- format(date.as.date, format = "%Y-%m-%A")
date.as.date.week.full
date.as.date.week.only
## [1] "2016-11-화요일" "2016-11-수요일" "2016-11-목요일" "2016-11-금요일"
## [1] "화요일" "수요일" "목요일" "금요일"
# Convert a factor variable into numeric dummy (indicator) variables.
lvl <- factor(c("A", "B", "A", "A", "C"))
df <- data.frame(lvl)
head(df)
## lvl
## 1 A
## 2 B
## 3 A
## 4 A
## 5 C
# model.matrix() expands the factor into an intercept column plus one
# indicator column per non-reference level.
dv <- model.matrix(~lvl, data = df)
dv
## (Intercept) lvlB lvlC
## 1 1 0 0
## 2 1 1 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$lvl
## [1] "contr.treatment"
# Drop the first column (the intercept), which is not needed as a feature.
dv1 <- dv[, -1]
dv1
## lvlB lvlC
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 1
다음의 날짜 벡터에서 “화”, “수”, “토”, “화”만 출력되게 하시오.
date.txt <- c("2016-11-01", "2017-11-01", "2018-11-03", "2019-11-05")
date.as.date <- as.Date(date.txt)
# Abbreviated weekday name (the %a format, e.g. "Tue"); the full name
# ("Tuesday") corresponds to %A.
date.as.date.week.only <- weekdays(date.as.date, abbreviate = TRUE)
date.as.date.week.only
## [1] "화" "수" "토" "화"
또한 년도와 월이 “16-11”, “17-11”, “18-11”,“19-11”로만 출력되게 하시오.
# %y is the two-digit year ("17"); %Y would give the four-digit year ("2017").
year.month.pattern <- "%y-%m"
date.as.date.year.month <- format(date.as.date, format = year.month.pattern)
date.as.date.year.month
## [1] "16-11" "17-11" "18-11" "19-11"
Binning을 통해 타이타닉 탑승자의 10세 단위 별 평균 생존률을 구하시오.
data <- read.csv("train.csv")
# Check the observed age range first so the bins cover every passenger.
min(data$Age, na.rm = TRUE)
max(data$Age, na.rm = TRUE)
## [1] 0.42
## [1] 80
# Survival rate = mean of the 0/1 Survived column over the selected rows.
# which() drops the rows whose Age is NA, so no na.rm is needed.
survival.rate <- function(keep) {
  mean(data$Survived[which(keep)])
}
age0 <- survival.rate(data$Age >= 0 & data$Age < 10)
age10 <- survival.rate(data$Age >= 10 & data$Age < 20)
age20 <- survival.rate(data$Age >= 20 & data$Age < 30)
age30 <- survival.rate(data$Age >= 30 & data$Age < 40)
age40 <- survival.rate(data$Age >= 40 & data$Age < 50)
age50 <- survival.rate(data$Age >= 50 & data$Age < 60)
age60 <- survival.rate(data$Age >= 60 & data$Age < 70)
# The last bin is closed on the right so the maximum age (80) is included.
age70 <- survival.rate(data$Age >= 70 & data$Age <= 80)
data.mean <- data.frame(
  age = c(0, 10, 20, 30, 40, 50, 60, 70),
  survived = c(age0, age10, age20, age30, age40, age50, age60, age70)
)
data.mean
# NOTE(review): the output previously recorded here listed the mean Age of
# each bin (4.08, 16.78, 24.53, ...) under a column named "pclass", because
# column 6 (Age) was averaged instead of Survived. Re-run this chunk to
# record the survival rates.
Binning을 통해 타이타닉 탑승자의 10세 단위 별 평균 좌석 등급을 구하시오.
# Confirm the observed age range before choosing the bin edges.
min(data$Age, na.rm = TRUE)
max(data$Age, na.rm = TRUE)
## [1] 0.42
## [1] 80
# Mean of column 3 (Pclass, per the original note) over the rows selected by
# `keep`; NA entries that arise from missing ages are ignored via na.rm.
pclass.mean <- function(keep) {
  mean(data[keep, 3], na.rm = TRUE)
}
age0 <- pclass.mean(data$Age >= 0 & data$Age < 10)
age10 <- pclass.mean(data$Age >= 10 & data$Age < 20)
age20 <- pclass.mean(data$Age >= 20 & data$Age < 30)
age30 <- pclass.mean(data$Age >= 30 & data$Age < 40)
age40 <- pclass.mean(data$Age >= 40 & data$Age < 50)
age50 <- pclass.mean(data$Age >= 50 & data$Age < 60)
age60 <- pclass.mean(data$Age >= 60 & data$Age < 70)
age70 <- pclass.mean(data$Age >= 70 & data$Age <= 80)
data.mean <- data.frame(
  age = c(0, 10, 20, 30, 40, 50, 60, 70),
  pclass = c(age0, age10, age20, age30, age40, age50, age60, age70)
)
data.mean
## age pclass
## 1 0 2.629032
## 2 10 2.470588
## 3 20 2.450000
## 4 30 2.113772
## 5 40 1.966292
## 6 50 1.562500
## 7 60 1.473684
## 8 70 1.714286
타이타닉 탑승자의 성별을 원 핫 인코딩으로 나타내시오
# model.matrix() needs the column to be a factor to build indicator columns.
data$Sex <- as.factor(data$Sex)
# Drop the intercept column. drop = FALSE keeps the single remaining column
# as a matrix, so its name (Sexmale) survives the conversion below instead of
# being replaced by the variable name.
one.hot.s <- model.matrix(~Sex, data = data)[, -1, drop = FALSE]
# A data frame is more convenient to inspect than a matrix.
one.hot.s <- as.data.frame(one.hot.s)
head(one.hot.s)
## Sexmale
## 1 1
## 2 0
## 3 0
## 4 0
## 5 1
## 6 1
타이타닉 탑승자의 Embarked를 원 핫 인코딩으로 나타내시오
# Encode Embarked as indicator columns: factor -> design matrix -> data frame.
data$Embarked <- as.factor(data$Embarked)
embarked.design <- model.matrix(~Embarked, data = data)
# Column 1 of the design matrix is the intercept; leave it out.
one.hot.E <- as.data.frame(embarked.design[, -1])
head(one.hot.E)
## EmbarkedC EmbarkedQ EmbarkedS
## 1 0 0 1
## 2 1 0 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 1 0
10개의 factor를 갖는 변수들을 원 핫 인코딩으로 나타내는 경우 몇 개의 변수로 분리되는가 - 9개로 분리된다
연령별 용돈의 평균을 조사한 자료 문제에서 이 자료를 표현하기 가장 적절한 그래프
data <- read.csv("agemoney.csv")
# Lower bound of each decade bin; a bin covers [from, from + 10).
decade.starts <- c(10, 20, 30, 40, 50)
# Mean of column 2 (money, per the original note) within each bin. `&` is
# the element-wise AND needed for vector comparisons.
money.by.decade <- vapply(decade.starts, function(from) {
  mean(data[data$age >= from & data$age < from + 10, 2])
}, numeric(1))
age10 <- money.by.decade[1]
age20 <- money.by.decade[2]
age30 <- money.by.decade[3]
age40 <- money.by.decade[4]
age50 <- money.by.decade[5]
data.mean <- data.frame(age = decade.starts, money = money.by.decade)
data.mean
## age money
## 1 10 5500
## 2 20 70000
## 3 30 211000
## 4 40 338000
## 5 50 393000
# Scatter plot of the raw observations.
plot(
  data$age, data$money,
  col = "blue", main = "agemoney", xlab = "age", ylab = "money"
)
# Bar chart of the per-decade means.
barplot(
  data.mean$money,
  names.arg = c("10대", "20대", "30대", "40대", "50대"),
  main = "agemoney", xlab = "age", ylab = "money"
)
a <- read.csv("mtest.csv")
head(a)
# Two numeric columns and one text column combined into a data frame.
x <- seq(1, 5, by = 1)
y <- x * 10
z <- rep(c("M", "F"), times = c(3, 2))
d <- data.frame(x, y, z)
# str() shows the dimensions and the type of every column.
str(d)
## 'data.frame': 5 obs. of 3 variables:
## $ x: num 1 2 3 4 5
## $ y: num 10 20 30 40 50
## $ z: Factor w/ 2 levels "F","M": 2 2 2 1 1
# Column vectors for a small table of branch records.
id <- c(1, 2, 3, 4, 5)
factory <- c("평택지부1", "평택지부2", "안산지부", "인천지부", "군포지부")
salse <- c(70, 90, 80, 85, 87)
# stringsAsFactors = FALSE keeps text columns as character.
d <- data.frame(id, factory, salse, stringsAsFactors = FALSE)
# Rebuild the table with an additional `type` column placed before salse.
type <- c("공장","공장","오피스","오피스","오피스")
d <- data.frame(id, factory, type, salse, stringsAsFactors = FALSE)
str(d)
## 'data.frame': 5 obs. of 4 variables:
## $ id : num 1 2 3 4 5
## $ factory: chr "평택지부1" "평택지부2" "안산지부" "인천지부" ...
## $ type : chr "공장" "공장" "오피스" "오피스" ...
## $ salse : num 70 90 80 85 87
# Convert one column to a factor; as.character() and as.numeric() are the
# analogous conversions to other types.
d$type <- factor(d$type)
# Seven staff records: id, name, gender and sales.
id <- seq_len(7)
name <- c("김원경", "박찬웅", " 조해선", " 김선영", "이화영", "양영욱", "최필선")
gender <- c("F", "M", "F", "F", "F", "M", "M")
sales <- c(1000, 2000, 1500, 2200, 1700, 2000, 2200)
# Text columns stay character rather than being converted to factors.
d <- data.frame(id, name, gender, sales, stringsAsFactors = FALSE)
print(d)
## id name gender sales
## 1 1 김원경 F 1000
## 2 2 박찬웅 M 2000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
## 6 6 양영욱 M 2000
## 7 7 최필선 M 2200
# name 칼럼을 불러오는 동일한 방법
d.col <- d$name
d.col <- d[ ,2]
d.col <- d[, "name"]
d.col
## [1] "김원경" "박찬웅" " 조해선" " 김선영" "이화영" "양영욱" "최필선"
d.cols <- d[,c(2,3,4)]
d.cols <- d[, 2:4]
d.cols <- d[, c("name", "gender", "sales")]
d.row <- d[2, ]
d.elemnet <- d[2,3]
dm <- d[d$gender == "M", ]
df <- d[d$gender == "F", ]
d[d$sales > 2000,]
## id name gender sales
## 4 4 김선영 F 2200
## 7 7 최필선 M 2200
d[d$gender == "F" & d$sales > 2000, ]
## id name gender sales
## 4 4 김선영 F 2200
d[d$gender == "F" | d$sales > 2000, ]
## id name gender sales
## 1 1 김원경 F 1000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
## 7 7 최필선 M 2200
d.man <- d[which(d$gender == "M"), ]
d.man
## id name gender sales
## 2 2 박찬웅 M 2000
## 6 6 양영욱 M 2000
## 7 7 최필선 M 2200
d.woman <- d[which(d$gender == "F"), ]
d.woman
## id name gender sales
## 1 1 김원경 F 1000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
d[which(d$sales > 2000),]
## id name gender sales
## 4 4 김선영 F 2200
## 7 7 최필선 M 2200
d[which(d$gender == "F" & d$sales > 2000), ]
## id name gender sales
## 4 4 김선영 F 2200
d[which(d$gender == "F" | d$sales > 2000), ]
## id name gender sales
## 1 1 김원경 F 1000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
## 7 7 최필선 M 2200
d.man <- subset(d, gender == "M")
d.man
## id name gender sales
## 2 2 박찬웅 M 2000
## 6 6 양영욱 M 2000
## 7 7 최필선 M 2200
d.woman <- subset(d, gender == "F")
d.woman
## id name gender sales
## 1 1 김원경 F 1000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
subset(d, gender == "F", select = c(name, gender))
## name gender
## 1 김원경 F
## 3 조해선 F
## 4 김선영 F
## 5 이화영 F
# Rows with sales above 2000. The condition must name the data frame's own
# column (sales): a misspelt name is looked up outside the data frame and can
# silently pick up an unrelated vector, returning the wrong rows.
subset(d, sales > 2000)
## id name gender sales
## 4 4 김선영 F 2200
## 7 7 최필선 M 2200
# Inside subset() the columns can be used bare; no d$ prefix is needed.
subset(d, gender == "F" & sales > 2000)
## id name gender sales
## 4 4 김선영 F 2200
subset(d, gender == "F" | sales > 2000)
## id name gender sales
## 1 1 김원경 F 1000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
## 7 7 최필선 M 2200
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
d.man <- d %>% filter(gender == "M")
d.man
## id name gender sales
## 1 2 박찬웅 M 2000
## 2 6 양영욱 M 2000
## 3 7 최필선 M 2200
d %>% filter(sales> 2000)
## id name gender sales
## 1 4 김선영 F 2200
## 2 7 최필선 M 2200
# filter() evaluates its conditions inside the data frame, so columns are
# referenced bare. Writing d$gender would read the original d rather than the
# data being piped, which gives wrong results once the pipe has earlier steps.
d %>% filter(gender == "F" & sales > 2000)
## id name gender sales
## 1 4 김선영 F 2200
d %>% filter(gender == "F" | sales > 2000)
## id name gender sales
## 1 1 김원경 F 1000
## 2 3 조해선 F 1500
## 3 4 김선영 F 2200
## 4 5 이화영 F 1700
## 5 7 최필선 M 2200
sales가 5 넘는 행을 추출하시오
a <- read.csv("mtest.csv")
a1 <- subset(a, sales > 5)
a1
## item price sales
## 1 풍선 100 200
## 3 테이프 300 20
## 5 색종이 150 100
price가 300 이상이면서 sales가 5 이상인 행을 추출하시오
a2 <- subset(a, price >= 300 & sales >=5)
a2
## item price sales
## 2 펌프 3000 5
## 3 테이프 300 20
subset 함수를 이용하여 #2의 item 칼럼만 추출하시오
# Keep only the item column of a2 (the stray trailing empty argument of the
# original call is removed).
a3 <- subset(a2, select = item)
a3
## item
## 2 펌프
## 3 테이프
d[, -2] # 특정 칼럼 제외
## id gender sales
## 1 1 F 1000
## 2 2 M 2000
## 3 3 F 1500
## 4 4 F 2200
## 5 5 F 1700
## 6 6 M 2000
## 7 7 M 2200
d[-2, ] # 특정 행 제외
## id name gender sales
## 1 1 김원경 F 1000
## 3 3 조해선 F 1500
## 4 4 김선영 F 2200
## 5 5 이화영 F 1700
## 6 6 양영욱 M 2000
## 7 7 최필선 M 2200
d[c(-1, -2)] # 여러 칼럼 제외
## gender sales
## 1 F 1000
## 2 M 2000
## 3 F 1500
## 4 F 2200
## 5 F 1700
## 6 M 2000
## 7 M 2200
x1 <- c(1, 2, 3)
x2 <- c(4, 5, 6)
x3 <- c(7, 8, 9)
d <- data.frame(x1, x2, x3)
# cbind() appends a vector as a new column.
x4 <- c(10, 11, 12)
d2 <- cbind(d, x4)
x5 <- c("James", "Mary", "Tony")
# stringsAsFactors = FALSE keeps x5 as character. The argument name must be
# spelt exactly: a misspelt name is treated as one more column to bind.
d3 <- cbind(d2, x5, stringsAsFactors = FALSE)
# Assigning through $ stores the vector as-is (character), no option needed.
d$x5 <- x5
d
## x1 x2 x3 x5
## 1 1 4 7 James
## 2 2 5 8 Mary
## 3 3 6 9 Tony
# Derived columns: row total, and a pass/fail label for totals above 15.
d$sum <- d$x1 + d$x2 + d$x3
d$sum
## [1] 12 15 18
d$pass <- ifelse(d$sum > 15, "pass", "fail")
d$pass
## [1] "fail" "fail" "pass"
a <- data.frame(id = c(1, 2, 3, 4, 5), mid = c(30, 40, 50, 60, 70))
b <- data.frame(id = c(5, 4, 3, 2, 1), final = c(70, 90, 100, 90, 80))
a;b
## id mid
## 1 1 30
## 2 2 40
## 3 3 50
## 4 4 60
## 5 5 70
## id final
## 1 5 70
## 2 4 90
## 3 3 100
## 4 2 90
## 5 1 80
left_join(a, b, by = "id")
## id mid final
## 1 1 30 80
## 2 2 40 90
## 3 3 50 100
## 4 4 60 90
## 5 5 70 70
old1 <- c(1, 2, 3)
old2 <- c(4, 5, 6)
타이타닉 데이터 셋을 남성과 여성으로 나누어 구분하고, 각각에 대한 생존율을 구하시오.
data <- read.csv("train.csv")
head(data)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
# Split the passengers by sex; the mean of the 0/1 Survived column is the
# survival rate of each group. TRUE is spelt out because T is an ordinary
# variable that can be reassigned.
data.m <- subset(data, Sex == "male")
data.f <- subset(data, Sex == "female")
mean(data.m$Survived, na.rm = TRUE)
mean(data.f$Survived, na.rm = TRUE)
## [1] 0.1889081
## [1] 0.7420382
남성이면서, 10대 미만, 10대 , 20대의 생존율을 구하시오.
# Male passengers by age band: under 10, 10-19 and 20-29. which() leaves out
# rows whose Age is NA, exactly as subset() does.
data.m0 <- data.m[which(data.m$Age < 10), ]
data.m10 <- data.m[which(data.m$Age >= 10 & data.m$Age < 20), ]
data.m20 <- data.m[which(data.m$Age >= 20 & data.m$Age < 30), ]
# Survival rate of each band.
mean(data.m0$Survived)
mean(data.m10$Survived)
mean(data.m20$Survived)
## [1] 0.59375
## [1] 0.122807
## [1] 0.1689189
여성이면서, 10대 미만, 10대 , 20대의 생존율을 구하시오.
# Female passengers by age band: under 10, 10-19 and 20-29. which() leaves
# out rows whose Age is NA, exactly as subset() does.
data.f0 <- data.f[which(data.f$Age < 10), ]
data.f10 <- data.f[which(data.f$Age >= 10 & data.f$Age < 20), ]
data.f20 <- data.f[which(data.f$Age >= 20 & data.f$Age < 30), ]
# Survival rate of each band.
mean(data.f0$Survived)
mean(data.f10$Survived)
mean(data.f20$Survived)
## [1] 0.6333333
## [1] 0.7555556
## [1] 0.7222222
sample(1:10,5)
sample(1:10,10)
split(iris, iris$Species) # 문자와 팩터로 보여준다.
# 출력되는 결과'$' 리스트 임을 알 수 있다.
# iris $ 을 사용하면 따라오는 주머니 3개가 있다. - 리스트
split(iris, 1:10) # 데이터를 10개의 집합으로 분류한다.
a <- 4
# A single value is tested, so a plain if/else expression is sufficient.
if (a == 3) " 3 입 니 다" else "3 이 아 닙 니 다"
## [1] "3 이 아 닙 니 다"
autoparts <- read.csv("autoparts.csv")
# Keep a single product and columns 2-11.
autoparts1 <- autoparts[autoparts$prod_no == "90784-76001", 2:11]
# Keep only rows whose thickness is below 1000.
autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
# y_faulty: 1 = faulty (thickness outside [20, 32]), 0 = normal.
autoparts2$y_faulty <- ifelse(
  autoparts2$c_thickness < 20 | autoparts2$c_thickness > 32, 1, 0
)
head(autoparts2$y_faulty)
## [1] 0 0 0 0 0 0
# g_class: 1 = below 20, 2 = within [20, 32], 3 = above 32. The upper bound
# is inclusive so that class 2 coincides exactly with y_faulty == 0; with a
# strict `< 32` a thickness of exactly 32 is normal yet lands in class 3.
autoparts2$g_class <- as.factor(
  ifelse(autoparts2$c_thickness < 20, 1,
    ifelse(autoparts2$c_thickness <= 32, 2, 3)
  )
)
autoparts2$g_class
15세 미만이면서 parch가 0 보다 큰 경우에 해당하는 탑승객은 몇 명인가? (parch는 탑승객의 부모의 수 혹은 자식의 수를 의미한다.)
data <- read.csv("train.csv")
# Flag passengers with Parch > 0 who are younger than 15. as.numeric() maps
# TRUE/FALSE/NA to 1/0/NA, so the flag is NA only where the condition is NA.
flagged <- data$Parch > 0 & data$Age < 15
data$j <- as.numeric(flagged)
as.numeric(data$j)
## [1] 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
## [24] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [47] 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 NA 0 0 0
## [70] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [93] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [116] 0 0 0 0 1 0 0 0 0 0 0 0 0 NA 0 0 0 0 0 0 0 0 0
## [139] 0 0 NA 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 NA 0
## [162] 0 0 0 1 1 NA 0 0 0 0 1 1 0 0 0 NA 0 0 0 NA 0 1 1
## [185] 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 NA 0 0 0 1 0
## [208] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA
## [231] 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [254] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [277] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## [300] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [323] 0 0 NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [346] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [369] 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## [392] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 NA 0 0 0 0
## [415] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## [438] 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
## [461] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0
## [484] 0 0 NA 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [507] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [530] 0 1 0 0 NA 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0
## [553] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [576] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0 0
## [599] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [622] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
## [645] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [668] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
## [691] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0
## [714] 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [737] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0
## [760] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [783] 0 NA 0 0 0 1 1 0 0 0 NA 0 0 0 0 0 0 0 0 0 1 1 0
## [806] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1
## [829] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0 1
## [852] 0 1 0 0 0 0 0 0 0 0 0 0 NA 0 0 0 0 0 1 0 0 0 0
## [875] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0
sum(data$j, na.rm = T)
## [1] 70
split 함수를 이용하여, 15세 미만이면서 parch가 0인 그룹의 생존률과, parch가 1 이상인 그룹의 생존률을 구해라
# Indicator column: 1 if younger than 15, 0 otherwise, NA when Age is missing.
data$under15 <- ifelse(data$Age < 15, 1, 0)
# Restrict to passengers under 15; which() drops rows with a missing Age.
under15 <- data[which(data$Age < 15), ]
# split() returns a list with one data frame per group, keyed by the value of
# the grouping condition ("FALSE" / "TRUE"); the pieces must be extracted
# from that list before their columns can be used.
by.parch <- split(under15, under15$Parch >= 1)
a <- by.parch[["FALSE"]] # Parch below 1, i.e. the Parch == 0 group
b <- by.parch[["TRUE"]] # Parch of 1 or more
# Survival rate of each group.
mean(a$Survived, na.rm = TRUE)
mean(b$Survived, na.rm = TRUE)
x <- c(1, NA, 3, 2, 2)
is.na(x)
## [1] FALSE TRUE FALSE FALSE FALSE
dim(autoparts)
## [1] 34139 11
sum(is.na(autoparts)) # is.na() 를 이용하면 결측치 TRUE로 출력
## [1] 0
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.5.1
head(french_fries)
## time treatment subject rep potato buttery grassy rancid painty
## 61 1 1 3 1 2.9 0.0 0.0 0.0 5.5
## 25 1 1 3 2 14.0 0.0 0.0 1.1 0.0
## 62 1 1 10 1 11.0 6.4 0.0 0.0 0.0
## 26 1 1 10 2 9.9 5.9 2.9 2.2 0.0
## 63 1 1 15 1 1.2 0.1 0.0 1.1 5.1
## 27 1 1 15 2 8.8 3.0 3.6 1.5 2.3
french_fries[!complete.cases(french_fries), ]
## time treatment subject rep potato buttery grassy rancid painty
## 315 5 3 15 1 NA NA NA NA NA
## 455 7 2 79 1 7.3 NA 0.0 0.7 0
## 515 8 1 79 1 10.5 NA 0.0 0.5 0
## 520 8 2 16 1 4.5 NA 1.4 6.7 0
## 563 8 2 79 2 5.7 0 1.4 2.3 NA
which(!complete.cases(french_fries))
## [1] 341 477 525 535 550
# Positions of the rows that contain at least one NA.
na.idx <- which(!complete.cases(french_fries))
# Keep the complete rows with a logical mask. Negative indexing by position
# (french_fries[-na.idx, ]) returns zero rows when na.idx is empty.
df.new <- french_fries[complete.cases(french_fries), ]
x <- c(1, NA, 3, 2 ,2)
mean(x, na.rm = TRUE)
## [1] 2
x <- 1:5
y <- c(2, 4, NA, 8, 10)
df <- data.frame(x, y)
# na.omit() and na.exclude() both return the data without the NA rows.
na.omit(df)
na.exclude(df)
# na.fail() signals an error when any NA is present; try() reports that
# error without aborting the rest of the script.
try(na.fail(df))
# na.pass() returns the data unchanged whether or not it contains NA.
na.pass(df)
#argument 옵션 으로 제거
resid(lm(y ~ x, data = df, na.action = na.omit))
## 1 3 4 5
## -12.5 -12.5 7.5 17.5
resid(lm(y ~ x, data = df, na.action = na.exclude))
## 1 2 3 4 5
## -12.5 NA -12.5 7.5 17.5
x <- c(1, 2, 3, NA, 5, NA, 7)
# Replace every NA with 0. `is.na(x) <- 0` does not do this: it marks the
# positions given on the right-hand side as NA, and position 0 selects
# nothing. The result is stored separately so x keeps its NAs.
x.zero <- replace(x, is.na(x), 0)
x.zero
french_fries$buttery[is.na(french_fries$buttery)] <- 0
french_fries[!complete.cases(french_fries), ]
## time treatment subject rep potato buttery grassy rancid painty
## 315 5 3 15 1 NA 0 NA NA NA
## 563 8 2 79 2 5.7 0 1.4 2.3 NA
df[is.na(df)]
## character(0)
x[is.na(x)] <- mean(x, na.rm = TRUE)
x
## [1] 1.0 2.0 3.0 3.6 5.0 3.6 7.0
french_fries$buttery[is.na(french_fries$buttery)] <- mean(french_fries$buttery, na.rm = TRUE)
french_fries[!complete.cases(french_fries), ]
## time treatment subject rep potato buttery grassy rancid painty
## 315 5 3 15 1 NA 0 NA NA NA
## 563 8 2 79 2 5.7 0 1.4 2.3 NA
타이타닉 데이터에서 결측치가 존재하는 행은 모두 몇 개 인가?
data <- read.csv("train.csv")
# Number of rows that contain at least one NA. sum(is.na(data)) counts NA
# cells instead, which matches the row count only while no row has more than
# one NA.
sum(!complete.cases(data))
## [1] 177
sum(data$Cabin == "")
## [1] 687
sum(data$Embarked == "")
## [1] 2
각 변수에 대해 결측치가 존재하는지를 찾아보고 변수 별 결측치의 개수를 탐색하시오
# for문으로 돌리기
# Print the number of NA values in every column, one line per column.
# The column names are looked up once, and seq_along() makes the loop run
# zero times for an empty data frame (1:length() would iterate over 1, 0).
name <- names(data)
for (i in seq_along(name)) {
  print(paste("number of Na in", name[i], "is", sum(is.na(data[[i]]))))
}
## [1] "number of Na in PassengerId is 0"
## [1] "number of Na in Survived is 0"
## [1] "number of Na in Pclass is 0"
## [1] "number of Na in Name is 0"
## [1] "number of Na in Sex is 0"
## [1] "number of Na in Age is 177"
## [1] "number of Na in SibSp is 0"
## [1] "number of Na in Parch is 0"
## [1] "number of Na in Ticket is 0"
## [1] "number of Na in Fare is 0"
## [1] "number of Na in Cabin is 0"
## [1] "number of Na in Embarked is 0"
나이에 대한 결측값을 나이의 median으로 치환하시오
# Replace missing ages with the median of the observed ages. na.rm = TRUE is
# required: without it median() returns NA as soon as any value is missing,
# and the NAs would be replaced by NA.
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
제거 - 오타, 오류 비상식적 응답과 같은 경우 단순히 제거
치환 - 제거가 어려운 경우 평균, 최빈값, 중앙값, 예측값 등으로 치환 - 행수가 보존 된다는 장점이 있지만 신뢰도에 문제가 생길 수 있음.
분리 - 독립변수가 충분히 세분되지 않은 경우 이상치가 발생할 수 있다. 이러한 경우에는 변수를 세분화하여 이상치를 분리한다.
autoparts <- read.csv("autoparts.csv")
autoparts3 <- autoparts[autoparts$prod_no == "45231-3B610", -1]
myboxplot <- boxplot(autoparts3$c_thickness) # boxplot으로 이상치 확인
myboxplot$out # 이상치 값들이 출력된다.
## [1] 33.5 33.5 33.5 34.1 18.1 17.1 16.6 17.9 18.0 37.3 32.5 13.3 32.3 9.3
## [15] 32.3 33.3 33.3 32.5 33.4 33.6 34.5 36.2 33.6 32.3 34.6 36.5 7.9 32.3
## [29] 32.6 10.3 16.5 17.0 5.9 17.8 36.3 36.9 36.6 37.1 36.6 39.5 38.8 39.0
## [43] 40.6 49.5 48.6 48.9 49.1 48.8 47.5 56.6
data <- autoparts3$c_thickness
which(data < fivenum(data)[2] - 1.5* IQR(data)) #fivenum(data)[2] 는 제 1사분위수
## [1] 313 316 416 418 424 568 893 1484 1768 1771 1783 1877 2120
which(data > fivenum(data)[4] + 1.5* IQR(data)) #fivenum(data)[4] 는 제 3사분위수
## [1] 305 306 307 308 461 564 748 1121 1275 1276 1277 1279 1408 1409
## [15] 1410 1411 1413 1482 1483 1591 1592 2363 2364 2365 2366 2367 2368 2369
## [29] 2370 2371 2372 2373 2374 2375 2376 2377 2378
사분위수를 이용한 방법으로 이상치를 검출하고, 그 결과가 boxplot이 제시한 결과와 같다는 것을 확인하시오
data <- autoparts3$c_thickness
# boxplot() flags points lying more than 1.5 hinge-spreads beyond the hinges,
# where the hinges are fivenum(data)[2] and fivenum(data)[4]. The spread is
# taken between those same hinges; IQR() uses a different quantile
# definition and need not agree with it.
hinges <- fivenum(data)[c(2, 4)]
fence <- 1.5 * diff(hinges)
outlier1 <- which(data < hinges[1] - fence) # below the lower fence
outlier2 <- which(data > hinges[2] + fence) # above the upper fence
outlier3 <- autoparts3[outlier1, ]
outlier4 <- autoparts3[outlier2, ]
a <- outlier3$c_thickness
b <- outlier4$c_thickness
# Sort both sets so they can be compared as wholes.
c.1 <- sort(c(a, b))
d.1 <- sort(myboxplot$out)
# One TRUE/FALSE answer; unlike `==` this is also FALSE when the two vectors
# differ in length instead of silently recycling.
isTRUE(all.equal(c.1, d.1))
## [1] TRUE
m <- lm(c_thickness ~ .,data = autoparts3)
head(rstudent(m), 10)
## 18780 18781 18782 18783 18784 18785
## -0.01545566 -0.29668984 0.13522741 -0.14901718 0.32864459 0.15521404
## 18786 18787 18788 18789
## 0.34504640 -0.02273073 -0.06916661 -0.11587747
잔차 그림
plot(rstudent(m), main = "Studentized Residual")
패키지 활용하기
#install.packages("car")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
outlierTest(m)
## rstudent unadjusted p-value Bonferonni p
## 21157 14.215108 4.5579e-44 1.0839e-40
## 20262 12.182360 3.7124e-33 8.8281e-30
## 19670 11.297318 7.3111e-29 1.7386e-25
## 21151 10.834554 9.8972e-27 2.3536e-23
## 20655 10.355221 1.3170e-24 3.1319e-21
## 21153 10.226805 4.7210e-24 1.1227e-20
## 21154 10.164636 8.7141e-24 2.0722e-20
## 21155 10.046863 2.7573e-23 6.5570e-20
## 21152 10.018596 3.6290e-23 8.6296e-20
## 20193 -9.002669 4.4145e-19 1.0498e-15
x <- outlierTest(m)
x
## rstudent unadjusted p-value Bonferonni p
## 21157 14.215108 4.5579e-44 1.0839e-40
## 20262 12.182360 3.7124e-33 8.8281e-30
## 19670 11.297318 7.3111e-29 1.7386e-25
## 21151 10.834554 9.8972e-27 2.3536e-23
## 20655 10.355221 1.3170e-24 3.1319e-21
## 21153 10.226805 4.7210e-24 1.1227e-20
## 21154 10.164636 8.7141e-24 2.0722e-20
## 21155 10.046863 2.7573e-23 6.5570e-20
## 21152 10.018596 3.6290e-23 8.6296e-20
## 20193 -9.002669 4.4145e-19 1.0498e-15
x$rstudent #
## 21157 20262 19670 21151 20655 21153 21154
## 14.215108 12.182360 11.297318 10.834554 10.355221 10.226805 10.164636
## 21155 21152 20193
## 10.046863 10.018596 -9.002669
names(x$rstudent) # 몇 번째 행이 잔차인지
## [1] "21157" "20262" "19670" "21151" "20655" "21153" "21154" "21155"
## [9] "21152" "20193"
# Fit thickness on all remaining columns and draw the lm diagnostic plots.
m <- lm(c_thickness ~ ., data = autoparts3)
plot(m)
# Cook's distance of every observation.
cooks <- cooks.distance(m)
# Observations above 4 / n are treated as influential; the cutoff is computed
# once and shared by the labels and the selection below.
cutoff <- 4 / nrow(autoparts3)
plot(cooks, pch = "ㅋ", cex = 1.5, main = "Plot for Cook's Distance")
# Label only the points above the cutoff with their row names.
text(
  x = seq_along(cooks), y = cooks,
  labels = ifelse(cooks > cutoff, names(cooks), " "), col = "red"
)
influential <- names(cooks)[cooks > cutoff]
influential
## [1] "18867" "19052" "19084" "19085" "19086" "19087" "19240" "19342"
## [9] "19343" "19344" "19345" "19346" "19347" "19496" "19497" "19499"
## [17] "19526" "19527" "19528" "19529" "19669" "19670" "19671" "19672"
## [25] "19803" "19804" "19897" "19899" "19900" "19901" "19995" "20054"
## [33] "20055" "20056" "20057" "20058" "20187" "20190" "20191" "20192"
## [41] "20193" "20261" "20262" "20263" "20370" "20371" "20374" "20375"
## [49] "20376" "20521" "20522" "20546" "20547" "20549" "20551" "20552"
## [57] "20560" "20561" "20585" "20586" "20617" "20654" "20655" "20656"
## [65] "20685" "20686" "20687" "20688" "20728" "20729" "20730" "20833"
## [73] "20900" "20901" "20981" "20982" "20983" "20984" "20993" "21009"
## [81] "21010" "21011" "21017" "21018" "21019" "21020" "21021" "21127"
## [89] "21128" "21129" "21130" "21131" "21132" "21133" "21134" "21135"
## [97] "21136" "21137" "21138" "21139" "21140" "21142" "21147" "21148"
## [105] "21149" "21150" "21151" "21152" "21153" "21154" "21155" "21156"
## [113] "21157"
행의 이름들을 가져오는데, a%in%b : a가 b에 있는지
autoparts3[rownames(autoparts3)%in%influential, ]
#install.packages("outliers")
library(outliers)
outlier(autoparts3$c_thickness) # 평균에서 가장 멀리 떨어진 최대값
## [1] 56.6
outlier(autoparts3$c_thickness, opposite = TRUE) # 최소값
## [1] 5.9
# install.packages("DMwR") # one-time setup, kept commented out like the other
# installs in this file. Original note: the package failed to install, so the
# code below was entered without being run.
library(DMwR)
# Local outlier factor of every row, using 5 neighbours.
score <- lofactor(autoparts3, k = 5)
plot(score)
# Row positions of the five largest scores (the name top3 is kept even though
# five positions are taken).
top3 <- order(score, decreasing = TRUE)[1:5]
top3
타이타닉 데이터 이상치 확인하기
data <- read.csv("train.csv")
head(data)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
# Box plots are only defined for numeric columns, so the non-numeric ones
# (names, tickets, ...) are left out before plotting.
boxplot(data[vapply(data, is.numeric, logical(1))])
# The fare column appears to contain outliers.
운임 이상값 출력
myboxplot <- boxplot(data$Fare)
운임 이상치를 제거한 데이터
data <- read.csv("train.csv")
data1 <- data$Fare
# Positions of fares beyond 1.5 IQR from the lower / upper hinge.
out.under <- which(data1 < fivenum(data1)[2] - 1.5 * IQR(data1))
out.upper <- which(data1 > fivenum(data1)[4] + 1.5 * IQR(data1))
NROW(out.under) # no outliers on the low side
## [1] 0
NROW(out.upper) # 116 outliers on the high side
## [1] 116
# Drop the upper outliers by position. setdiff() keeps every row when
# out.upper is empty, whereas data[-out.upper, ] would then return zero rows.
data1 <- data[setdiff(seq_len(nrow(data)), out.upper), ]
NROW(data) - NROW(data1)
## [1] 116
이상치 대체하기 - 평균값으로 대체하기
# Work on a copy so the original data stays untouched.
data.2 <- data
# Mean fare over all rows, outliers included.
fare.mean <- mean(data.2$Fare)
# Overwrite the fares at the outlier positions with that mean.
data.2$Fare <- replace(data.2$Fare, out.upper, fare.mean)
data.2[out.upper, "Fare"]
## [1] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [8] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [15] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [22] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [29] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [36] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [43] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [50] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [57] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [64] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [71] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [78] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [85] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [92] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [99] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [106] 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421 32.20421
## [113] 32.20421 32.20421 32.20421 32.20421
fare.freq <- table(data$Fare)
fare.max.freq <- which.max(fare.freq)