knitr::opts_knit$set(root.dir = "/Users/MYMAC/Desktop/Dataset")
library("forecast")
ApplianceShipments.csv 파일에는 1985년부터 1989년 사이의 미국 가전제품 분기별 shipments(출하량, 단위: 백만 달러) 데이터가 들어있다.
# Read the quarterly appliance shipments file and show the raw table.
appliance_data <- read.csv(file = "ApplianceShipments.csv")
print(appliance_data)
## Quarter Shipments X Data.courtesy.Ken.Black
## 1 Q1-1985 4009 NA NA
## 2 Q2-1985 4321 NA NA
## 3 Q3-1985 4224 NA NA
## 4 Q4-1985 3944 NA NA
## 5 Q1-1986 4123 NA NA
## 6 Q2-1986 4522 NA NA
## 7 Q3-1986 4657 NA NA
## 8 Q4-1986 4030 NA NA
## 9 Q1-1987 4493 NA NA
## 10 Q2-1987 4806 NA NA
## 11 Q3-1987 4551 NA NA
## 12 Q4-1987 4485 NA NA
## 13 Q1-1988 4595 NA NA
## 14 Q2-1988 4799 NA NA
## 15 Q3-1988 4417 NA NA
## 16 Q4-1988 4258 NA NA
## 17 Q1-1989 4245 NA NA
## 18 Q2-1989 4900 NA NA
## 19 Q3-1989 4585 NA NA
## 20 Q4-1989 4533 NA NA
# Remove the third and fourth columns (all NA in the printed table above),
# leaving only the first two columns.
appliance_data <- appliance_data[, -c(3, 4)]
print(appliance_data)
## Quarter Shipments
## 1 Q1-1985 4009
## 2 Q2-1985 4321
## 3 Q3-1985 4224
## 4 Q4-1985 3944
## 5 Q1-1986 4123
## 6 Q2-1986 4522
## 7 Q3-1986 4657
## 8 Q4-1986 4030
## 9 Q1-1987 4493
## 10 Q2-1987 4806
## 11 Q3-1987 4551
## 12 Q4-1987 4485
## 13 Q1-1988 4595
## 14 Q2-1988 4799
## 15 Q3-1988 4417
## 16 Q4-1988 4258
## 17 Q1-1989 4245
## 18 Q2-1989 4900
## 19 Q3-1989 4585
## 20 Q4-1989 4533
# Build a quarterly time series (4 observations per year) covering
# Q1-1985 through Q4-1989 and draw it.
appliance_timeseries <- ts(
  appliance_data$Shipments,
  start = c(1985, 1),
  end = c(1989, 4),
  frequency = 4
)
plot(
  appliance_timeseries,
  main = "Appliance Shipments",
  xlab = "Year",
  ylab = "Shipments",
  ylim = c(3800, 5100)
)
b.분기별 패턴이 보이는가? 패턴을 자세히 보려면 y축 값 3500~5000 범위를 확대해 보시오.
# Same series, with the y axis restricted to 3500-5000 to expose the
# quarterly pattern.
plot(
  appliance_timeseries,
  main = "Appliance Shipments",
  xlab = "Year",
  ylab = "Shipments",
  ylim = c(3500, 5000)
)
c.R을 이용하여 Q1,Q2,Q3,Q4에 대해 한 개의 차트에 네 개의 선으로 꺾은선 그래프를 그리시오. R에서는 data.frame을 생성하여 각 분기의 데이터를 담고, 그것을 선 그래프로 그린다. y축 값 3500~5000 범위를 확대해 보시오. 분기별로 차이점이 보이는가?
# Tag every row with its quarter code. The labelling is vectorised over rows
# and sized from the data instead of a hard-coded 20. Rows whose first column
# contains none of Q1..Q4 keep the placeholder "0".
quarter_text <- as.character(appliance_data[[1]])
Quater_only <- rep("0", nrow(appliance_data))
for (quarter_code in c("Q1", "Q2", "Q3", "Q4")) {
  Quater_only[grepl(quarter_code, quarter_text, fixed = TRUE)] <- quarter_code
}
tmp <- appliance_data
tmp$Quater_only <- Quater_only

# One data frame per quarter.
Q1_data <- subset(tmp, Quater_only == "Q1")
Q2_data <- subset(tmp, Quater_only == "Q2")
Q3_data <- subset(tmp, Quater_only == "Q3")
Q4_data <- subset(tmp, Quater_only == "Q4")

# One yearly series per quarter (one value per year, 1985-1989).
Q1_timeseries <- ts(Q1_data$Shipments, start = c(1985), end = c(1989))
Q2_timeseries <- ts(Q2_data$Shipments, start = c(1985), end = c(1989))
Q3_timeseries <- ts(Q3_data$Shipments, start = c(1985), end = c(1989))
Q4_timeseries <- ts(Q4_data$Shipments, start = c(1985), end = c(1989))

# Draw the four quarters on one chart; line type, colour and symbol all
# follow the quarter number so the legend below matches the lines.
plot(
  Q1_timeseries,
  main = "Shipments by Quarter",
  xlab = "Year", ylab = "Shipments", ylim = c(3500, 5000),
  bty = "l", type = "b", lwd = 1.5, lty = 1, col = 1, pch = 1
)
lines(Q2_timeseries, type = "b", lwd = 1.5, lty = 2, col = 2, pch = 2)
lines(Q3_timeseries, type = "b", lwd = 1.5, lty = 3, col = 3, pch = 3)
lines(Q4_timeseries, type = "b", lwd = 1.5, lty = 4, col = 4, pch = 4)

colors <- 1:4
linetype <- 1:4
plotchar <- 1:4
# xpd = NA lets the legend extend outside the plot region.
legend(1989, 4000, 1:4, cex = 0.8, col = colors, pch = plotchar,
       lty = linetype, title = "Quarter", xpd = NA)
# Second way to draw the per-quarter chart: start from an empty frame and
# add one line per quarter, picking observations by their cycle position.
xrange <- c(1985, 1989)
yrange <- c(3500, 5000)
colors <- 1:4
linetype <- 1:4
plotchar <- 1:4
x_cord <- 1985:1989

plot(
  xrange, yrange,
  type = "n",
  main = "Shipments by Quarter",
  xlab = "Year", ylab = "Shipments",
  bty = "l"
)
for (i in 1:4) {
  # Observations of the full series that fall in quarter i.
  current_quarter <- subset(appliance_timeseries, cycle(appliance_timeseries) == i)
  lines(
    x_cord, current_quarter,
    type = "b", lwd = 1.5,
    lty = linetype[i], col = colors[i], pch = plotchar[i]
  )
}
legend(1989, 4000, 1:4, cex = 0.8, col = colors, pch = plotchar,
       lty = linetype, title = "Quarter", xpd = NA)
# Collapse the quarterly series to yearly totals and plot them as a line.
annual_sum <- aggregate(appliance_timeseries, FUN = sum)
xrange <- 1985:1989
plot(
  xrange, annual_sum,
  type = "l",
  main = "Sum of Shipments by Annually",
  xlab = "Year", ylab = "Shipments",
  bty = "l"
)
이 기계의 제조회사는 대대적인 판촉 캠페인을 벌이기 전에 잠재된 우수고객을 발견하고자 한다. 특히, 이 제조사는 개별 가구들을 Income(가계소득, 단위: 1천 달러)과 Lot Size(대지면적, 단위: 1천 평방피트)를 기준으로 잠재고객 여부(owner/nonowner)를 판별하려고 한다. 마케팅 전문가는 RidingMowers.csv 파일 안에 들어있는 24개 랜덤표본가구 데이터를 사용한다.
# Read the riding-mower household sample and show it.
ridingmowers <- read.csv(file = "RidingMowers.csv")
print(ridingmowers)
## Income Lot_Size Ownership
## 1 60.0 18.4 Owner
## 2 85.5 16.8 Owner
## 3 64.8 21.6 Owner
## 4 61.5 20.8 Owner
## 5 87.0 23.6 Owner
## 6 110.1 19.2 Owner
## 7 108.0 17.6 Owner
## 8 82.8 22.4 Owner
## 9 69.0 20.0 Owner
## 10 93.0 20.8 Owner
## 11 51.0 22.0 Owner
## 12 81.0 20.0 Owner
## 13 75.0 19.6 Nonowner
## 14 52.8 20.8 Nonowner
## 15 64.8 17.2 Nonowner
## 16 43.2 20.4 Nonowner
## 17 84.0 17.6 Nonowner
## 18 49.2 17.6 Nonowner
## 19 59.4 16.0 Nonowner
## 20 66.0 18.4 Nonowner
## 21 47.4 16.4 Nonowner
## 22 33.0 18.8 Nonowner
## 23 51.0 14.0 Nonowner
## 24 63.0 14.8 Nonowner
# Scatter plot of Income against Lot Size, with owners and non-owners
# distinguished by both colour and plotting symbol.
# Ownership is compared exactly: a pattern match on "Owner" only separates
# the classes because "Nonowner" happens to use a lowercase "o".
is_owner <- ridingmowers$Ownership == "Owner"
plot(
  ridingmowers$Income ~ ridingmowers$Lot_Size,
  main = "Relationship between Lot Size and Income",
  xlab = "Lot Size",
  ylab = "Income",
  col = ifelse(is_owner, "black", "red"),
  pch = ifelse(is_owner, 1, 2)
)
# The legend symbols must mirror the plot: circle (1) for owners and
# triangle (2) for non-owners.
legend(
  "topright", inset = c(0, 0),
  legend = c("Owner", "Nonowner"),
  col = c("black", "red"), pch = c(1, 2), cex = 0.8
)
LaptopSalesJanuary2008.csv 파일은 런던 소재의 한 컴퓨터 체인점의 2008년 1월 매출데이터이다. 이것은 2008년도 전체 매출데이터의 일부이다.
# Read the January 2008 laptop sales file and list its column names.
laptopsales <- read.csv(file = "LaptopSalesJanuary2008.csv")
print(names(laptopsales))
## [1] "Date" "Configuration"
## [3] "Customer.Postcode" "Store.Postcode"
## [5] "Retail.Price" "Screen.Size..Inches."
## [7] "Battery.Life..Hours." "RAM..GB."
## [9] "Processor.Speeds..GHz." "Integrated.Wireless."
## [11] "HD.Size..GB." "Bundled.Applications."
## [13] "OS.X.Customer" "OS.Y.Customer"
## [15] "OS.X.Store" "OS.Y.Store"
## [17] "CustomerStoreDistance"
# Mean retail price for each store postcode.
average_retail_price_by_store <- aggregate(
  Retail.Price ~ Store.Postcode,
  data = laptopsales,
  FUN = mean
)
print(average_retail_price_by_store)
## Store.Postcode Retail.Price
## 1 CR7 8LE 488.6190
## 2 E2 0RY 483.1717
## 3 E7 8NW 494.3814
## 4 KT2 5AU 493.9048
## 5 N17 6QA 494.6341
## 6 N3 1DH 487.3684
## 7 NW5 2QH 486.5805
## 8 S1P 3AU 486.2500
## 9 SE1 2BN 486.6802
## 10 SE8 3JD 492.1778
## 11 SW12 9HD 485.2957
## 12 SW18 1NN 493.0389
## 13 SW1P 3AU 488.5069
## 14 SW1V 4QQ 489.3450
## 15 W10 6HQ 489.8667
## 16 W4 3PH 481.0063
# Sort the stores by ascending mean price and draw them as a bar chart with
# rotated, shrunken postcode labels.
price_rank <- order(average_retail_price_by_store$Retail.Price)
ordered_data <- average_retail_price_by_store[price_rank, ]
barplot(
  height = ordered_data$Retail.Price,
  names.arg = ordered_data$Store.Postcode,
  main = "Average Retail Price by Store ",
  xlab = "Store Postcode",
  ylab = "Average Retail Price",
  ylim = c(0, 500),
  cex.names = 0.5,
  las = 3
)
# Print the postcode of every store whose mean price equals the overall
# maximum or minimum, in table order. The extremes are computed once rather
# than on every iteration, and seq_along() keeps the loop from running on an
# empty table.
store_prices <- average_retail_price_by_store$Retail.Price
store_codes <- average_retail_price_by_store$Store.Postcode
highest_price <- max(store_prices)
lowest_price <- min(store_prices)
for (i in seq_along(store_prices)) {
  if (store_prices[i] == highest_price) {
    print(store_codes[i])
  }
  if (store_prices[i] == lowest_price) {
    print(store_codes[i])
  }
}
## [1] N17 6QA
## 16 Levels: CR7 8LE E2 0RY E7 8NW KT2 5AU N17 6QA N3 1DH ... W4 3PH
## [1] W4 3PH
## 16 Levels: CR7 8LE E2 0RY E7 8NW KT2 5AU N17 6QA N3 1DH ... W4 3PH
-> 평균 소매가격이 494.6341인 “N17 6QA”의 주소를 가진 매장이 가장 높은 평균 소매가격을 보여주었으며, 481.0063인 “W4 3PH”의 주소를 가진 매장이 가장 낮은 평균 소매가격을 보여주었습니다.
# Retail price distribution for every store, one box per postcode, with
# small perpendicular axis labels.
boxplot(
  laptopsales$Retail.Price ~ laptopsales$Store.Postcode,
  main = "Boxplot for 16 stores Retail Price",
  xlab = "Store",
  ylab = "Retail Price",
  las = 2,
  cex.axis = 0.5
)
16개의 전체 매장별 박스플랏 비교
# Compare the stores with the lowest and highest mean price side by side.
minimum <- subset(laptopsales, Store.Postcode == "W4 3PH")
maximum <- subset(laptopsales, Store.Postcode == "N17 6QA")
min_max <- rbind(minimum, maximum)
# Re-create the factor so only the two remaining postcodes are levels;
# otherwise the box plot would reserve a slot for every store.
min_max$Store.Postcode <- factor(min_max$Store.Postcode)
# cex.axis (not the barplot-only cex.names) is what shrinks box plot axis
# labels, matching the all-stores box plot.
boxplot(
  min_max$Retail.Price ~ min_max$Store.Postcode,
  main = "Comparison between Maximum and Minimum",
  xlab = "Store",
  ylab = "Retail Price",
  cex.axis = 0.5,
  las = 2
)
summary(minimum$Retail.Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 300.0 442.5 480.0 481.0 515.0 665.0
summary(maximum$Retail.Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 320.0 462.5 495.0 494.6 535.0 645.0
(a)에서 찾은 두 매장의 가격 비교: 매장의 평균 소매가가 가장 높은 N17 6QA 매장은 평균 소매가가 가장 낮은 W4 3PH 매장과 IQR 값이 72.5로 같음을 알 수 있습니다. 하지만, W4 3PH의 중앙값은 480이고, N17 6QA의 중앙값은 495임을 알 수 있습니다. 또한, N17 6QA는 이상치가 적게 존재하고, W4 3PH는 이상치가 양쪽으로 많이 존재하고 있습니다.
이 과제는 대화형 시각화 툴을 사용하도록 준비되었다. Laptop.txt라는 파일은 30만 행에 달하는 콤마로 분리된 파일이다. 이 데이터는 2009년 가을 거행된 콘테스트를 위해 ENBIS(the European Network for Business and Industrial Statistics)가 제공하였다.
시나리오: 여러분이 노트북 컴퓨터를 판매하는 Acell이라는 회사의 데이터 분석가라고 가정하자. 여러분에게 제품과 판매에 대한 데이터가 제공되었다. 여러분에게 주어진 과제는 2009년도 ACELL사의 예상매출을 극대화하는 제품전략과 가격정책을 만드는 것이다. 대화형 시각화 툴을 사용하여 아래 질문에 답하시오.
# Read the full-year laptop sales file and preview its first rows.
laptop_data <- read.csv(file = "LaptopSales.csv")
print(head(laptop_data))
## Date Configuration Customer.Postcode Store.Postcode
## 1 1/1/2008 0:01 163 EC4V 5BH SE1 2BN
## 2 1/1/2008 0:02 320 SW4 0JL SW12 9HD
## 3 1/1/2008 0:04 23 EC3V 1LR E2 0RY
## 4 1/1/2008 0:04 169 SW1P 3AU SE1 2BN
## 5 1/1/2008 0:06 365 EC4V 4EG SW1V 4QQ
## 6 1/1/2008 0:12 309 W1B 5PX SW1V 4QQ
## Retail.Price Screen.Size..Inches. Battery.Life..Hours. RAM..GB.
## 1 455 15 5 1
## 2 545 15 6 1
## 3 515 15 4 1
## 4 395 15 5 1
## 5 585 15 6 2
## 6 555 15 6 1
## Processor.Speeds..GHz. Integrated.Wireless. HD.Size..GB.
## 1 2 Yes 80
## 2 2 No 300
## 3 2 Yes 300
## 4 2 No 40
## 5 2 No 120
## 6 2 Yes 120
## Bundled.Applications. customer.X customer.Y store.X store.Y
## 1 Yes 532041 180995 534057 179682
## 2 No 529240 175537 528739 173080
## 3 Yes 533095 181047 535652 182961
## 4 Yes 529902 179641 534057 179682
## 5 Yes 531684 180948 528924 178440
## 6 Yes 529207 180969 528924 178440
# Preview the last rows of the sales table.
print(tail(laptop_data))
## Date Configuration Customer.Postcode Store.Postcode
## 297567 12/30/2008 23:54 854 SW19 3NW SW12 9HD
## 297568 12/30/2008 23:55 703 SE1 2UP SW1P 3AU
## 297569 12/30/2008 23:55 731 N13 4JD N3 1DH
## 297570 12/30/2008 23:57 375 SE25 6EF CR7 8LE
## 297571 12/30/2008 23:58 101 SW8 1LA SW1P 3AU
## 297572 12/30/2008 23:58 343 SE16 4QZ SE8 3JD
## Retail.Price Screen.Size..Inches. Battery.Life..Hours. RAM..GB.
## 297567 780 17 6 4
## 297568 NA 17 5 4
## 297569 392 17 6 1
## 297570 441 15 6 2
## 297571 406 15 4 4
## 297572 530 15 6 2
## Processor.Speeds..GHz. Integrated.Wireless. HD.Size..GB.
## 297567 2.4 Yes 120
## 297568 2.0 No 300
## 297569 1.5 No 80
## 297570 2.4 Yes 300
## 297571 1.5 Yes 120
## 297572 1.5 Yes 300
## Bundled.Applications. customer.X customer.Y store.X store.Y
## 297567 No 524772 169692 528739 173080
## 297568 Yes 533595 180025 529902 179641
## 297569 Yes 531165 192973 525109 190628
## 297570 Yes 534086 168521 532714 168302
## 297571 Yes 530716 177488 529902 179641
## 297572 Yes 534348 179162 537175 177885
# Number of sales records with a missing retail price.
missing_price <- is.na(laptop_data$Retail.Price)
print(sum(missing_price))
## [1] 13443
매출액 데이터에 결측값이 있음을 상위 6개, 하위 6개 데이터를 보면서 알 수 있었습니다. 결측치의 존재 여부를 파악한 후, 결측치의 개수를 계산한 결과 13443개의 매출액 결측치가 있음을 알 수 있었습니다.
# Density-scaled histogram of retail price with a kernel density curve
# (missing prices dropped) overlaid in red.
hist(
  laptop_data$Retail.Price,
  probability = TRUE,
  main = "Retail Price",
  xlab = "Retail Price",
  ylab = "Frequency",
  ylim = c(0, 0.005)
)
price_density <- density(x = laptop_data$Retail.Price, na.rm = TRUE)
lines(price_density, col = "Red")
결측치를 제거하고, Retail.Price의 분포를 히스토그램으로 그려 살펴본 결과, 500파운드 근처에서 가장 밀집되어 있음을 알 수 있었습니다. 하지만,
# Per-configuration summary matrix: mean price, standard deviation of price,
# and number of complete sales records. Row i describes configuration i.
na_rm_laptop <- na.omit(laptop_data)
config_count <- max(na_rm_laptop$Configuration)
tmp <- matrix(
  NA_real_,
  nrow = config_count, ncol = 3,
  dimnames = list(
    seq_len(config_count),
    c("Mean", "Standard Deviation", "Count")
  )
)

# Mean and sd are taken over all rows with a non-missing price.
mean_data <- aggregate(laptop_data$Retail.Price,
                       by = list(laptop_data$Configuration),
                       FUN = mean, na.rm = TRUE)
sd_data <- aggregate(laptop_data$Retail.Price,
                     by = list(laptop_data$Configuration),
                     FUN = sd, na.rm = TRUE)

# Place each statistic in the row of its own configuration id instead of
# relying on the aggregate output being a gap-free 1..max sequence;
# configurations that never occur stay NA.
mean_rows <- mean_data[[1]] %in% seq_len(config_count)
sd_rows <- sd_data[[1]] %in% seq_len(config_count)
tmp[mean_data[[1]][mean_rows], 1] <- mean_data[[2]][mean_rows]
tmp[sd_data[[1]][sd_rows], 2] <- sd_data[[2]][sd_rows]

# Count complete records per configuration in a single pass (replaces one
# full-table subset per configuration).
tmp[, 3] <- tabulate(na_rm_laptop$Configuration, nbins = config_count)
head(tmp)
## Mean Standard Deviation Count
## 1 337.7628 27.28609 253
## 2 288.5492 24.68505 244
## 3 376.6780 28.20846 264
## 4 329.0316 27.18746 253
## 5 392.0983 34.24757 234
## 6 348.1106 25.69360 226
# Scatter of every complete sale: retail price against configuration id.
plot(
  na_rm_laptop$Retail.Price ~ na_rm_laptop$Configuration,
  ylim = c(100, 800), xlim = c(0, 900),
  pch = 19, col = 1,
  main = "Retail Price by Configuration",
  xlab = " Configuration",
  ylab = "Retail price"
)
# Replace the sale timestamp with its month number (1-12) and plot the mean
# retail price per month.
# The date is parsed straight to Date: the trailing clock time is ignored by
# the format, and no POSIXct -> Date conversion is involved, which could move
# the calendar day depending on the session time zone.
# The month number comes from format "%m", which does not depend on the
# locale (month names from months() do).
sale_dates <- as.Date(as.character(na_rm_laptop$Date), format = "%m/%d/%Y")
na_rm_laptop$Date <- as.integer(format(sale_dates, "%m"))
mean_by_month <- aggregate(na_rm_laptop$Retail.Price,
                           list(na_rm_laptop$Date), FUN = mean)
# NOTE(review): this variable is named `ts` and shadows the ts() function
# name; kept so any later use of the name still works.
ts <- ts(mean_by_month$x, start = c(2008, 1), end = c(2008, 12), frequency = 12)
plot(1:12, ts, type = "o", pch = 19,
     main = "Retail Price Timeseries",
     xlab = "Month", ylab = "Average Retail Price",
     ylim = c(400, 600))
# Label every month on the x axis.
axis(1, at = 1:12, labels = 1:12)
# Rank configurations by number of sales (third column), most sold first.
count_rank <- order(tmp[, 3], decreasing = TRUE)
ordered_by_counts <- tmp[count_rank, ]
print(head(ordered_by_counts))
## Mean Standard Deviation Count
## 61 388.8812 55.02394 825
## 345 430.1907 54.56083 816
## 207 469.2224 56.24305 814
## 53 411.4321 52.11270 810
## 340 439.5771 54.96629 804
## 63 447.9824 55.89726 795
# Mean retail price per calendar day, drawn as one line per month with
# day-of-month on the x axis.
na_rm_laptop <- na.omit(laptop_data)
# Parse straight to Date (the clock time after the date is ignored), which
# avoids a POSIXct -> Date conversion that could move the calendar day
# depending on the session time zone.
na_rm_laptop$Date <- as.Date(as.character(na_rm_laptop$Date),
                             format = "%m/%d/%Y")
mean_by_month_day <- aggregate(na_rm_laptop$Retail.Price,
                               list(na_rm_laptop$Date), FUN = mean)

# Month and day as numbers: independent of the session locale (the month
# names returned by months() are not) and of how many days each month
# actually has in the data.
mean_by_month_day$month <- as.integer(format(mean_by_month_day$Group.1, "%m"))
mean_by_month_day$day <- as.integer(format(mean_by_month_day$Group.1, "%d"))

# One colour index per month, shared by the lines and the legend so they
# cannot disagree.
# NOTE(review): with the default 8-colour palette, indices above 8 repeat
# earlier colours; consider a 12-colour palette.
month_cols <- 1:12

# Empty frame spanning days 1-31; the x axis is drawn explicitly below.
plot(c(1, 31), c(400, 600), type = "n", ylim = c(400, 600),
     main = " Retail Price by Day", xlab = "Day", ylab = "Retail Price",
     xaxt = "n")
for (month_idx in 1:12) {
  month_rows <- mean_by_month_day[mean_by_month_day$month == month_idx, ]
  lines(month_rows$day, month_rows$x,
        col = month_cols[month_idx], type = "o", pch = 19)
}
axis(1, at = 1:31, labels = 1:31)
legend(32, 610,
       c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
       pch = 19, cex = 0.5, col = month_cols, lty = 1,
       title = "Month", xpd = NA)
# Monthly mean retail price for the three best-selling configurations.
na_rm_laptop <- na.omit(laptop_data)
# Month number (1-12) taken directly from the parsed date. This does not
# depend on the locale (month names from months() do) and involves no
# POSIXct -> Date conversion, which could move the calendar day depending on
# the session time zone.
sale_dates <- as.Date(as.character(na_rm_laptop$Date), format = "%m/%d/%Y")
na_rm_laptop$Date <- as.integer(format(sale_dates, "%m"))
mean_by_month <- aggregate(na_rm_laptop$Retail.Price,
                           list(na_rm_laptop$Date), FUN = mean)

# Configuration ids 61, 345 and 207 are the first three rows of the
# sales-count ranking printed earlier.
top1 <- subset(na_rm_laptop, Configuration == 61)
top2 <- subset(na_rm_laptop, Configuration == 345)
top3 <- subset(na_rm_laptop, Configuration == 207)

mean_by_month_top1 <- aggregate(top1$Retail.Price, list(top1$Date), FUN = mean)
mean_by_month_top2 <- aggregate(top2$Retail.Price, list(top2$Date), FUN = mean)
mean_by_month_top3 <- aggregate(top3$Retail.Price, list(top3$Date), FUN = mean)

ts1 <- ts(mean_by_month_top1$x, start = c(2008, 1), end = c(2008, 12), frequency = 12)
ts2 <- ts(mean_by_month_top2$x, start = c(2008, 1), end = c(2008, 12), frequency = 12)
ts3 <- ts(mean_by_month_top3$x, start = c(2008, 1), end = c(2008, 12), frequency = 12)

plot(1:12, ts1, type = "l",
     main = "TOP 3 Items' Average Retail Price",
     xlab = "Month", ylab = "Average Retail Price",
     ylim = c(300, 600))
lines(1:12, ts2, col = 2)
lines(1:12, ts3, col = 3)
colors <- 1:3
legend(10, 600, c(61, 345, 207), pch = 19, cex = 0.8, col = colors,
       lty = 1, title = "Configuration", xpd = NA)
상위 3개 제품별 월평균 판매가격의 시계열 그래프
# Configuration 72: total number of sales records ...
config_72 <- subset(laptop_data, Configuration == 72)
print(nrow(config_72))
## [1] 821
# ... and how many of them have no retail price.
print(sum(is.na(config_72$Retail.Price)))
## [1] 82
판매가격은 각 매장별로 일관성이 있는가?
판매 가격은 컴퓨터 사양에 따라 어떻게 다른가?
매장의 위치와 고객의 위치는 어디인가?
고객이 매장을 방문하기 위해 얼마나 이동하는지 다른 방법을 통해 알아보자. 고객과 매장 간의 이동거리를 담은 새로운 데이터 열을 만들어서 이 문제를 풀어보자
4.8절에 있는 아침식사용 시리얼 예제의 데이터를 사용하여 아래와 같이 데이터를 탐색하고 요약하시오.
# Read the breakfast cereal data and preview the first rows.
cereals <- read.csv(file = "Cereals.csv")
print(head(cereals))
## name mfr type calories protein fat sodium fiber
## 1 100%_Bran N C 70 4 1 130 10.0
## 2 100%_Natural_Bran Q C 120 3 5 15 2.0
## 3 All-Bran K C 70 4 1 260 9.0
## 4 All-Bran_with_Extra_Fiber K C 50 4 0 140 14.0
## 5 Almond_Delight R C 110 2 2 200 1.0
## 6 Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5
## carbo sugars potass vitamins shelf weight cups rating
## 1 5.0 6 280 25 3 1 0.33 68.40297
## 2 8.0 8 135 0 3 1 1.00 33.98368
## 3 7.0 5 320 25 3 1 0.33 59.42551
## 4 8.0 0 330 25 3 1 0.50 93.70491
## 5 14.0 8 NA 25 3 1 0.75 34.38484
## 6 10.5 10 70 25 1 1 0.75 29.50954
# Show the column types of the cereal table.
str(object = cereals)
## 'data.frame': 77 obs. of 16 variables:
## $ name : Factor w/ 77 levels "100%_Bran","100%_Natural_Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ mfr : Factor w/ 7 levels "A","G","K","N",..: 4 6 3 3 7 2 3 2 7 5 ...
## $ type : Factor w/ 2 levels "C","H": 1 1 1 1 1 1 1 1 1 1 ...
## $ calories: int 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
양적/수치적 변수: calories, protein, fat, sodium, fiber, carbo, sugars, potass, vitamins, weight, cups, rating이 있습니다. 순서형 변수: shelf가 있습니다. 명목형 변수: name, mfr, type이 있습니다.
# Quantitative columns only: columns 4-16 of the cereal table minus the
# ordinal `shelf` column, which is dropped by name so the selection does not
# depend on its position.
quan_data <- cereals[, 4:16]
quan_data <- quan_data[, names(quan_data) != "shelf"]

# One row per variable with mean, median, min, max and sd, ignoring missing
# values. vapply() guarantees one numeric value per column.
summary_data <- data.frame(
  mean = vapply(quan_data, mean, numeric(1), na.rm = TRUE),
  median = vapply(quan_data, median, numeric(1), na.rm = TRUE),
  min = vapply(quan_data, min, numeric(1), na.rm = TRUE),
  max = vapply(quan_data, max, numeric(1), na.rm = TRUE),
  sd = vapply(quan_data, sd, numeric(1), na.rm = TRUE)
)
summary_data
## mean median min max sd
## calories 106.883117 110.00000 50.00000 160.00000 19.4841191
## protein 2.545455 3.00000 1.00000 6.00000 1.0947897
## fat 1.012987 1.00000 0.00000 5.00000 1.0064726
## sodium 159.675325 180.00000 0.00000 320.00000 83.8322952
## fiber 2.151948 2.00000 0.00000 14.00000 2.3833640
## carbo 14.802632 14.50000 5.00000 23.00000 3.9073256
## sugars 7.026316 7.00000 0.00000 15.00000 4.3786564
## potass 98.666667 90.00000 15.00000 330.00000 70.4106360
## vitamins 28.246753 25.00000 0.00000 100.00000 22.3425225
## weight 1.029610 1.00000 0.50000 1.50000 0.1504768
## cups 0.821039 0.75000 0.25000 1.50000 0.2327161
## rating 42.665705 40.40021 18.04285 93.70491 14.0472887
# 3 x 4 grid (filled column by column) with one histogram per quantitative
# variable, followed by the summary table for reference.
par(mfcol = c(3, 4))
hist_columns <- c("calories", "protein", "fat", "sodium", "fiber", "carbo",
                  "sugars", "potass", "vitamins", "weight", "cups", "rating")
hist_titles <- c("Calories", "Protein", "Fat", "Sodium", "Fiber", "Carbo",
                 "Sugars", "Potass", "Vitamins", "Weight", "Cups", "Rating")
# Axis labels equal the title words, except that the weight panel is
# labelled "Weights".
hist_xlabs <- hist_titles
hist_xlabs[hist_columns == "weight"] <- "Weights"
for (panel_idx in seq_along(hist_columns)) {
  hist(
    quan_data[[hist_columns[panel_idx]]],
    main = paste(hist_titles[panel_idx], "Histogram"),
    xlab = hist_xlabs[panel_idx]
  )
}
summary_data
## mean median min max sd
## calories 106.883117 110.00000 50.00000 160.00000 19.4841191
## protein 2.545455 3.00000 1.00000 6.00000 1.0947897
## fat 1.012987 1.00000 0.00000 5.00000 1.0064726
## sodium 159.675325 180.00000 0.00000 320.00000 83.8322952
## fiber 2.151948 2.00000 0.00000 14.00000 2.3833640
## carbo 14.802632 14.50000 5.00000 23.00000 3.9073256
## sugars 7.026316 7.00000 0.00000 15.00000 4.3786564
## potass 98.666667 90.00000 15.00000 330.00000 70.4106360
## vitamins 28.246753 25.00000 0.00000 100.00000 22.3425225
## weight 1.029610 1.00000 0.50000 1.50000 0.1504768
## cups 0.821039 0.75000 0.25000 1.50000 0.2327161
## rating 42.665705 40.40021 18.04285 93.70491 14.0472887
# Calories of hot versus cold cereals in two side-by-side panels, then
# rating broken down by shelf level.
cold_data <- subset(cereals, type == "C")
hot_data <- subset(cereals, type == "H")
par(mfcol = c(1, 2))
boxplot(
  hot_data$calories,
  main = "Boxplot Hot-cereals' Calories",
  xlab = "Hot", ylab = "Calories"
)
boxplot(
  cold_data$calories,
  main = "Boxplot Cold-cereals' Calories",
  xlab = "Cold", ylab = "Calories"
)
boxplot(
  cereals$rating ~ cereals$shelf,
  main = "Boxplot for Rating by Shelf levels",
  xlab = "Shelf levels",
  ylab = "Rating"
)
# Missing cells among the quantitative columns before cleaning ...
print(sum(is.na(quan_data)))
## [1] 4
# ... drop every row that has one, and confirm none remain.
na_rm_quan_data <- na.omit(quan_data)
print(sum(is.na(na_rm_quan_data)))
## [1] 0
# Pairwise correlations on the complete rows, rounded to two decimals.
round(cor(na_rm_quan_data), digits = 2)
## calories protein fat sodium fiber carbo sugars potass vitamins
## calories 1.00 0.03 0.51 0.30 -0.30 0.27 0.57 -0.07 0.26
## protein 0.03 1.00 0.20 0.01 0.51 -0.04 -0.29 0.58 0.05
## fat 0.51 0.20 1.00 0.00 0.01 -0.28 0.29 0.20 -0.03
## sodium 0.30 0.01 0.00 1.00 -0.07 0.33 0.04 -0.04 0.33
## fiber -0.30 0.51 0.01 -0.07 1.00 -0.38 -0.15 0.91 -0.04
## carbo 0.27 -0.04 -0.28 0.33 -0.38 1.00 -0.45 -0.37 0.25
## sugars 0.57 -0.29 0.29 0.04 -0.15 -0.45 1.00 0.00 0.07
## potass -0.07 0.58 0.20 -0.04 0.91 -0.37 0.00 1.00 0.00
## vitamins 0.26 0.05 -0.03 0.33 -0.04 0.25 0.07 0.00 1.00
## weight 0.70 0.23 0.22 0.31 0.25 0.14 0.46 0.42 0.32
## cups 0.09 -0.24 -0.16 0.12 -0.51 0.36 -0.03 -0.50 0.13
## rating -0.69 0.47 -0.41 -0.38 0.60 0.06 -0.76 0.42 -0.21
## weight cups rating
## calories 0.70 0.09 -0.69
## protein 0.23 -0.24 0.47
## fat 0.22 -0.16 -0.41
## sodium 0.31 0.12 -0.38
## fiber 0.25 -0.51 0.60
## carbo 0.14 0.36 0.06
## sugars 0.46 -0.03 -0.76
## potass 0.42 -0.50 0.42
## vitamins 0.32 0.13 -0.21
## weight 1.00 -0.20 -0.30
## cups -0.20 1.00 -0.22
## rating -0.30 -0.22 1.00
# Panel function for a scatter-plot matrix: writes the correlation of x and
# y (rounded to two decimals) in the middle of the panel, with the text size
# scaled by the magnitude of the correlation.
#
# x, y: numeric vectors of equal length.
# Called for its drawing side effect; returns the result of text().
panel_for_cor <- function(x, y) {
  # Switch the panel to unit coordinates. par() returns the previous
  # setting as a list, which is what on.exit() needs to restore it; passing
  # the bare numeric vector back to par() does not restore anything.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)
  r <- round(cor(x, y), digits = 2)
  txt <- paste0(r)
  # Base size chosen from the label width.
  cex.cor <- 0.3 / strwidth(txt)
  # cex must be positive to take effect, so scale by |r| rather than by its
  # negation.
  text(0.5, 0.5, txt, cex = abs(cex.cor * r))
}
# Panel function for a scatter-plot matrix: draws the (x, y) pairs as small
# filled dots.
upper_panel <- function(x, y) {
  points(x = x, y = y, cex = 0.2, pch = 19)
}
# Scatter-plot matrix of the complete quantitative rows: point clouds above
# the diagonal, correlation coefficients below it.
plot(
  na_rm_quan_data,
  upper.panel = upper_panel,
  lower.panel = panel_for_cor,
  main = "Scatter Matrix for Quantitative Variables"
)
ToyotaCorolla.csv 파일은 네덜란드에서 2004년 늦여름 동안 판매되었던 중고차(도요타 코롤라)에 대한 데이터이다. 1436개의 레코드에는 각 중고차의 가격(Price), 사용기간(Age), 주행거리(Kilometers), 마력(HP), 및 기타사양을 포함하여 38개의 특성에 관한 세부사항이포함되어 있다. 분석 목표는 사양에 따른 중고 도요타 코롤라의 가격을 예측하는 것이다.
# Read the used Toyota Corolla listings and show the column types.
toyota <- read.csv(file = "ToyotaCorolla.csv")
str(object = toyota)
## 'data.frame': 1436 obs. of 39 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Model : Factor w/ 319 levels "TOYOTA Corolla ",..: 276 276 276 276 275 275 269 269 257 246 ...
## $ Price : int 13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
## $ Age_08_04 : int 23 23 24 26 30 32 27 30 27 23 ...
## $ Mfg_Month : int 10 10 9 7 3 1 6 3 6 10 ...
## $ Mfg_Year : int 2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
## $ KM : int 46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
## $ Fuel_Type : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 2 2 2 2 3 2 ...
## $ HP : int 90 90 90 90 90 90 90 90 192 69 ...
## $ Met_Color : int 1 1 1 0 0 0 1 1 0 0 ...
## $ Color : Factor w/ 10 levels "Beige","Black",..: 3 7 3 2 2 9 5 5 6 3 ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CC : int 2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Gears : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Quarterly_Tax : int 210 210 210 210 210 210 210 210 100 185 ...
## $ Weight : int 1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
## $ Mfr_Guarantee : int 0 0 1 1 1 0 0 1 0 0 ...
## $ BOVAG_Guarantee : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Guarantee_Period : int 3 3 3 3 3 3 3 3 3 3 ...
## $ ABS : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_2 : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Airco : int 0 1 0 0 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Boardcomputer : int 1 1 1 1 1 1 1 1 0 1 ...
## $ CD_Player : int 0 1 0 0 0 0 0 1 0 0 ...
## $ Central_Lock : int 1 1 0 0 1 1 1 1 1 0 ...
## $ Powered_Windows : int 1 0 0 0 1 1 1 1 1 0 ...
## $ Power_Steering : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Radio : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Mistlamps : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Sport_Model : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Backseat_Divider : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Metallic_Rim : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Radio_cassette : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Parking_Assistant: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...