빅데이터분석 4일차 실습

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## 필요한 패키지를 로딩중입니다: ggplot2

## 필요한 패키지를 로딩중입니다: lattice

library(lubridate)

## 
## 다음의 패키지를 부착합니다: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## 다음의 패키지를 부착합니다: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(mlbench)
setwd("C:/data")
df<-read.csv("pub.csv")
colSums(is.na(df))

##          X       Case Restaurant      Price       Food      Decor    Service 
##          0          0          0          0          0          0          0 
##       East   latitude  longitude 
##          0          0          0

glimpse(df)

## Rows: 165
## Columns: 10
## $ X          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Case       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Restaurant <chr> "Daniella Ristorante", "Tello's Ristorante", "Biricchino", …
## $ Price      <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47,…
## $ Food       <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20,…
## $ Decor      <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18,…
## $ Service    <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21,…
## $ East       <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ latitude   <dbl> 40.74683, 40.74342, 40.74886, 40.74848, 40.73958, 40.74069,…
## $ longitude  <dbl> -73.99676, -73.99954, -73.99552, -74.00331, -73.99591, -73.…

df1<-df[,c(4,5,6,7,8)]
set.seed(453)
glimpse(df1)

## Rows: 165
## Columns: 5
## $ Price   <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47, 37…
## $ Food    <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20, 21…
## $ Decor   <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18, 19…
## $ Service <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21, 21…
## $ East    <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

idx<-createDataPartition(df1$Price, p=0.75, list= FALSE)
train <- df1[idx,]
test <- df1[-idx,]

control <- trainControl(method="cv", number=5)
model <- train(Price~., data=train, method="lm",
               preProcess=c("center","scale"),trControl=control)

pred <- predict(model, newdata=test)

error<-test$Price-pred
sqrt(mean(error^2))

## [1] 4.067108

빅데이터분석 4일차 실습

choi doo jin

2024-02-01