chapter 3

# Knit every chunk relative to the dataset folder so the read.csv() calls in
# this document can use bare file names.
# NOTE(review): this is an absolute, machine-specific path - adjust before
# running on another machine.
knitr::opts_knit$set(root.dir = "/Users/MYMAC/Desktop/Dataset")
# NOTE(review): presumably loaded for its time-series helpers (e.g. a subset()
# method for ts objects); no forecast:: call is written explicitly - confirm.
library("forecast")

3.1 가전제품 출하량: 선그래프.

ApplianceShipments.csv 파일에는 1985년부터 1989년 사이의 미국 가전제품 분기별 shipments(출하량, 단위: 백만 달러) 데이터가 들어있다.

# Load the quarterly appliance shipments and show the raw table.
appliance_data <- read.csv(file = "ApplianceShipments.csv")
print(appliance_data)

##    Quarter Shipments  X Data.courtesy.Ken.Black
## 1  Q1-1985      4009 NA                      NA
## 2  Q2-1985      4321 NA                      NA
## 3  Q3-1985      4224 NA                      NA
## 4  Q4-1985      3944 NA                      NA
## 5  Q1-1986      4123 NA                      NA
## 6  Q2-1986      4522 NA                      NA
## 7  Q3-1986      4657 NA                      NA
## 8  Q4-1986      4030 NA                      NA
## 9  Q1-1987      4493 NA                      NA
## 10 Q2-1987      4806 NA                      NA
## 11 Q3-1987      4551 NA                      NA
## 12 Q4-1987      4485 NA                      NA
## 13 Q1-1988      4595 NA                      NA
## 14 Q2-1988      4799 NA                      NA
## 15 Q3-1988      4417 NA                      NA
## 16 Q4-1988      4258 NA                      NA
## 17 Q1-1989      4245 NA                      NA
## 18 Q2-1989      4900 NA                      NA
## 19 Q3-1989      4585 NA                      NA
## 20 Q4-1989      4533 NA                      NA

불필요한 열 2개 제거(3열,4열)를 해주었습니다.

# Keep only Quarter and Shipments by dropping columns 3 and 4.
appliance_data <- appliance_data[, -c(3, 4)]
print(appliance_data)

##    Quarter Shipments
## 1  Q1-1985      4009
## 2  Q2-1985      4321
## 3  Q3-1985      4224
## 4  Q4-1985      3944
## 5  Q1-1986      4123
## 6  Q2-1986      4522
## 7  Q3-1986      4657
## 8  Q4-1986      4030
## 9  Q1-1987      4493
## 10 Q2-1987      4806
## 11 Q3-1987      4551
## 12 Q4-1987      4485
## 13 Q1-1988      4595
## 14 Q2-1988      4799
## 15 Q3-1988      4417
## 16 Q4-1988      4258
## 17 Q1-1989      4245
## 18 Q2-1989      4900
## 19 Q3-1989      4585
## 20 Q4-1989      4533

시계열 형태의 데이터로 받아줍니다.

# Quarterly series running from 1985 Q1 through 1989 Q4.
appliance_timeseries <- ts(
  appliance_data$Shipments,
  start = c(1985, 1),
  end = c(1989, 4),
  frequency = 4
)

R을 사용하여 시계열 차트를 그리시오.

# Line chart of the full quarterly series.
plot(
  appliance_timeseries,
  ylim = c(3800, 5100),
  main = "Appliance Shipments",
  xlab = "Year",
  ylab = "Shipments"
)

b.분기별 패턴이 보이는가? 패턴을 자세히 보려면 y축 값 3500~5000 범위를 확대해 보시오.

# Same chart with the y-axis limited to 3500-5000, as the exercise asks.
plot(
  appliance_timeseries,
  ylim = c(3500, 5000),
  main = "Appliance Shipments",
  xlab = "Year",
  ylab = "Shipments"
)

c.R을 이용하여 Q1,Q2,Q3,Q4에 대해 한 개의 차트에 네 개의 선으로 꺾은선 그래프를 그리시오. R에서는 data.frame을 생성하여 각 분기의 데이터를 담고, 그것을 선 그래프로 그린다. y축 값 3500~5000 범위를 확대해 보시오. 분기별로 차이점이 보이는가?

dataframe 을 생성하여 분기의 데이터를 담아서 만드는 방법

# Build a data frame that carries a quarter label per row, split it into one
# yearly series per quarter, and draw the four series on a single chart.

# The label is the first two characters of Quarter (e.g. "Q1-1985" -> "Q1").
# It is computed for all rows at once, so the row count is not hard-coded and
# no row-by-row loop is needed.
Quater_only <- substr(as.character(appliance_data$Quarter), 1, 2)
tmp <- cbind(appliance_data, Quater_only)

Q1_data <- subset(tmp, Quater_only == "Q1")
Q2_data <- subset(tmp, Quater_only == "Q2")
Q3_data <- subset(tmp, Quater_only == "Q3")
Q4_data <- subset(tmp, Quater_only == "Q4")

# One observation per year starting in 1985; the end of each series follows
# from the number of observations instead of being forced to 1989.
Q1_timeseries <- ts(Q1_data$Shipments, start = 1985)
Q2_timeseries <- ts(Q2_data$Shipments, start = 1985)
Q3_timeseries <- ts(Q3_data$Shipments, start = 1985)
Q4_timeseries <- ts(Q4_data$Shipments, start = 1985)

# Per-quarter colour, line type and plotting symbol, shared by the lines and
# the legend so the two cannot drift apart.
colors <- 1:4
linetype <- 1:4
plotchar <- 1:4

plot(
  Q1_timeseries,
  main = "Shipments by Quarter", xlab = "Year", ylab = "Shipments",
  ylim = c(3500, 5000), bty = "l", type = "b", lwd = 1.5,
  lty = linetype[1], col = colors[1], pch = plotchar[1]
)
lines(Q2_timeseries, type = "b", lwd = 1.5,
      lty = linetype[2], col = colors[2], pch = plotchar[2])
lines(Q3_timeseries, type = "b", lwd = 1.5,
      lty = linetype[3], col = colors[3], pch = plotchar[3])
lines(Q4_timeseries, type = "b", lwd = 1.5,
      lty = linetype[4], col = colors[4], pch = plotchar[4])

# xpd = NA lets the legend be drawn even where it extends past the plot region.
legend(1989, 4000, 1:4, cex = 0.8, col = colors, pch = plotchar,
       lty = linetype, title = "Quarter", xpd = NA)

바로 timeseries를 이용하여 보이는 방법

# Same four-line chart, built straight from the quarterly ts object.
colors <- 1:4
linetype <- 1:4
plotchar <- 1:4
xrange <- c(1985, 1989)
yrange <- range(3500, 5000)
x_cord <- 1985:1989

# Empty canvas spanning the wanted x/y range; the lines are added below.
plot(
  xrange, yrange,
  type = "n", bty = "l",
  main = "Shipments by Quarter", xlab = "Year", ylab = "Shipments"
)

# cycle() gives the quarter (1-4) of every observation; pick one quarter at a
# time and draw its values against the years.
for (quarter in 1:4) {
  quarter_values <- subset(
    appliance_timeseries,
    cycle(appliance_timeseries) == quarter
  )
  lines(
    x_cord, quarter_values,
    type = "b", lwd = 1.5,
    lty = linetype[quarter], col = colors[quarter], pch = plotchar[quarter]
  )
}

legend(1989, 4000, 1:4,
       cex = 0.8, col = colors, pch = plotchar, lty = linetype,
       title = "Quarter", xpd = NA)

R을 이용하여 연도별 집계(각 연도의 Shipments 합계)를 구하고 선 그래프를 그리시오.

# Collapse the quarterly series to one total per year, then plot the totals.
annual_sum <- aggregate(appliance_timeseries, nfrequency = 1, FUN = sum)
xrange <- 1985:1989
plot(
  xrange, annual_sum,
  type = "l", bty = "l",
  main = "Sum of Shipments by Annually",
  xlab = "Year", ylab = "Shipments"
)

3.2 승차식 잔디깎이 기계 매출액: 산점도

이 기계의 제조회사는 대대적인 판촉 캠페인을 벌이기 전에 잠재적 우수고객을 발견하고자 한다. 특히, 이 제조사는 개별 가구들을 Income(가계소득, 단위: 1천 달러)과 Lot Size(대지면적, 단위: 1천 평방피트)를 기준으로 잠재고객 여부(owner/nonowner)를 판별하려고 한다. 마케팅 전문가는 RidingMowers.csv 파일 안에 들어있는 24개 랜덤표본가구 데이터를 사용한다.

R을 사용하여 Lot Size와 Income의 관계를 결과변수인 owner/nonowner에 의해 컬러-코드화된 산점도를 그리시오. 형식이 잘 갖춰진 플롯(읽기 쉬운 라벨과 범례를 포함)을 만드시오

# Load the riding-mower sample and show it.
ridingmowers <- read.csv(file = "RidingMowers.csv")
print(ridingmowers)

##    Income Lot_Size Ownership
## 1    60.0     18.4     Owner
## 2    85.5     16.8     Owner
## 3    64.8     21.6     Owner
## 4    61.5     20.8     Owner
## 5    87.0     23.6     Owner
## 6   110.1     19.2     Owner
## 7   108.0     17.6     Owner
## 8    82.8     22.4     Owner
## 9    69.0     20.0     Owner
## 10   93.0     20.8     Owner
## 11   51.0     22.0     Owner
## 12   81.0     20.0     Owner
## 13   75.0     19.6  Nonowner
## 14   52.8     20.8  Nonowner
## 15   64.8     17.2  Nonowner
## 16   43.2     20.4  Nonowner
## 17   84.0     17.6  Nonowner
## 18   49.2     17.6  Nonowner
## 19   59.4     16.0  Nonowner
## 20   66.0     18.4  Nonowner
## 21   47.4     16.4  Nonowner
## 22   33.0     18.8  Nonowner
## 23   51.0     14.0  Nonowner
## 24   63.0     14.8  Nonowner

# Scatter plot of Income against Lot Size, with ownership encoded twice:
# by colour and by plotting symbol (so it survives black-and-white printing).

# TRUE for rows whose Ownership contains "Owner"; the match is case-sensitive,
# so "Nonowner" is FALSE.
is_owner <- grepl("Owner", ridingmowers$Ownership, fixed = TRUE)

# Order: Owner first, Nonowner second. The same vectors feed the legend.
group_col <- c("black", "red")
group_pch <- c(1, 2)

plot(ridingmowers$Income ~ ridingmowers$Lot_Size,
     main = "Relationship between Lot Size and Income",
     xlab = "Lot Size",
     ylab = "Income",
     col = ifelse(is_owner, group_col[1], group_col[2]),
     pch = ifelse(is_owner, group_pch[1], group_pch[2]))

# The legend previously used pch = 1 for both groups, although Nonowner points
# are drawn with symbol 2; it now shows the symbol actually used per group.
legend("topright", inset = c(0, 0),
       legend = c("Owner", "Nonowner"),
       col = group_col, pch = group_pch, cex = 0.8)

Owner일 경우 검은색, Nonowner일 경우 빨간색으로 컬러-코드화를 하였습니다. 그런데, 프린트할때 색깔이 흑백으로만 나오기에 동그라미와 세모로 한번더 다르게 보여주었습니다.

3.3 런던 컴퓨터 체인점의 노트북 판매실적: 막대차트와 박스플롯.

LaptopSalesJanuary.csv 파일은 런던 소재의 한 컴퓨터 체인점의 2008년 1월 매출데이터이다. 이것은 2008년도 전체 매출데이터의 일부이다.

매장별 평균 소매가격을 보여주는 막대차트를 그리시오. 평균 소매가격이 가장 높은 매장은 어느 곳인가? 반대로 평균 소매가격이 가장 낮은 매장은 어느 곳인가?

# Load the January 2008 sales and list the available columns.
laptopsales <- read.csv(file = "LaptopSalesJanuary2008.csv")
print(names(laptopsales))

##  [1] "Date"                   "Configuration"         
##  [3] "Customer.Postcode"      "Store.Postcode"        
##  [5] "Retail.Price"           "Screen.Size..Inches."  
##  [7] "Battery.Life..Hours."   "RAM..GB."              
##  [9] "Processor.Speeds..GHz." "Integrated.Wireless."  
## [11] "HD.Size..GB."           "Bundled.Applications." 
## [13] "OS.X.Customer"          "OS.Y.Customer"         
## [15] "OS.X.Store"             "OS.Y.Store"            
## [17] "CustomerStoreDistance"

# Mean retail price for each store postcode.
average_retail_price_by_store <- aggregate(
  Retail.Price ~ Store.Postcode,
  data = laptopsales,
  FUN = mean
)
print(average_retail_price_by_store)

##    Store.Postcode Retail.Price
## 1         CR7 8LE     488.6190
## 2          E2 0RY     483.1717
## 3          E7 8NW     494.3814
## 4         KT2 5AU     493.9048
## 5         N17 6QA     494.6341
## 6          N3 1DH     487.3684
## 7         NW5 2QH     486.5805
## 8         S1P 3AU     486.2500
## 9         SE1 2BN     486.6802
## 10        SE8 3JD     492.1778
## 11       SW12 9HD     485.2957
## 12       SW18 1NN     493.0389
## 13       SW1P 3AU     488.5069
## 14       SW1V 4QQ     489.3450
## 15        W10 6HQ     489.8667
## 16         W4 3PH     481.0063

# Bar chart of the store averages, sorted from lowest to highest price.
price_rank <- order(average_retail_price_by_store$Retail.Price)
ordered_data <- average_retail_price_by_store[price_rank, ]
barplot(
  height = ordered_data$Retail.Price,
  names.arg = ordered_data$Store.Postcode,
  main = "Average Retail Price by Store ",
  xlab = "Store Postcode", ylab = "Average Retail Price",
  ylim = c(0, 500), las = 3, cex.names = 0.5
)

# Walk the stores in table order and print the postcode of every store whose
# average equals the overall maximum or the overall minimum.
store_price <- average_retail_price_by_store$Retail.Price
store_code <- average_retail_price_by_store$Store.Postcode
for (row in seq_along(store_price)) {
  if (store_price[row] == max(store_price)) {
    print(store_code[row])
  }
  if (store_price[row] == min(store_price)) {
    print(store_code[row])
  }
}

## [1] N17 6QA
## 16 Levels: CR7 8LE E2 0RY E7 8NW KT2 5AU N17 6QA N3 1DH ... W4 3PH
## [1] W4 3PH
## 16 Levels: CR7 8LE E2 0RY E7 8NW KT2 5AU N17 6QA N3 1DH ... W4 3PH

-> 평균 소매가격이 494.6341인 “N17 6QA”의 주소를 가진 매장이 가장 높은 평균 소매가격을 보여주었으며, 481.0063인 “W4 3PH”의 주소를 가진 매장이 가장 낮은 평균 소매가격을 보여주었습니다.

매장별 소매가격을 더 잘 비교하려면 병렬 박스플롯을 그리시오. (a)에서 찾은 두 매장의 가격을 비교해보시오. 두 매장의 가격분포가 어떤 차이점이 있는가?

# One box per store; labels are shrunk and rotated so all postcodes fit.
boxplot(
  Retail.Price ~ Store.Postcode,
  data = laptopsales,
  las = 2, cex.axis = 0.5,
  main = "Boxplot for 16 stores Retail Price",
  xlab = "Store",
  ylab = "Retail Price"
)

16개의 전체 매장별 박스플랏 비교

# Compare the price distributions of the two stores found in part (a):
# the lowest-average store (W4 3PH) and the highest-average store (N17 6QA).
minimum <- subset(laptopsales, Store.Postcode == "W4 3PH")
maximum <- subset(laptopsales, Store.Postcode == "N17 6QA")
min_max <- rbind(minimum, maximum)
# Rebuild the factor so only the two selected postcodes remain as levels;
# otherwise every unused level would get an empty box.
min_max$Store.Postcode <- factor(min_max$Store.Postcode)


# cex.names is a barplot() argument and is not expected to affect boxplot();
# cex.axis is the setting that scales the axis labels (as in the 16-store
# boxplot). The misspelled title ("Mimum") is corrected as well.
boxplot(min_max$Retail.Price ~ min_max$Store.Postcode,
        main = "Comparison between Maximum and Minimum",
        xlab = "Store",
        ylab = "Retail Price", cex.axis = 0.5,
        las = 2)

# Five-number summary plus mean for the lowest-average store.
print(summary(minimum$Retail.Price))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   300.0   442.5   480.0   481.0   515.0   665.0

# Five-number summary plus mean for the highest-average store.
print(summary(maximum$Retail.Price))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   320.0   462.5   495.0   494.6   535.0   645.0

(a)에서 찾은 두 매장의 가격 비교: 평균 소매가가 가장 높은 N17 6QA 매장과 평균 소매가가 가장 낮은 W4 3PH 매장은 IQR 값이 72.5로 같음을 알 수 있습니다. 하지만, W4 3PH의 중앙값은 480이고, N17 6QA의 중앙값은 495임을 알 수 있습니다. 또한, N17 6QA는 이상치가 적게 존재하고, W4 3PH는 이상치가 양쪽으로 많이 존재하고 있습니다.

3.4 런던 컴퓨터 체인점의 노트북 판매실적: 대화형 시각화.

이 과제는 대화형 시각화 툴을 사용하도록 준비되었다. Laptop.txt라는 파일은 30만 행에 달하는, 콤마로 분리된 파일이다. 이 데이터는 2009년 가을에 열린 콘테스트를 위해 ENBIS(the European Network for Business and Industrial Statistics)가 제공하였다.

시나리오: 여러분이 노트북 컴퓨터를 판매하는 Acell이라는 회사의 데이터 분석가라고 가정하자. 여러분에게 제품과 판매에 대한 데이터가 제공되었다. 여러분에게 주어진 과제는 2009년도 ACELL사의 예상매출을 극대화하는 제품전략과 가격정책을 만드는 것이다. 대화형 시각화 툴을 사용하여 아래 질문에 답하시오.

가격에 관한 질문

# Load the full-year sales file and preview the first rows.
laptop_data <- read.csv(file = "LaptopSales.csv")
print(head(laptop_data))

##            Date Configuration Customer.Postcode Store.Postcode
## 1 1/1/2008 0:01           163          EC4V 5BH        SE1 2BN
## 2 1/1/2008 0:02           320           SW4 0JL       SW12 9HD
## 3 1/1/2008 0:04            23          EC3V 1LR         E2 0RY
## 4 1/1/2008 0:04           169          SW1P 3AU        SE1 2BN
## 5 1/1/2008 0:06           365          EC4V 4EG       SW1V 4QQ
## 6 1/1/2008 0:12           309           W1B 5PX       SW1V 4QQ
##   Retail.Price Screen.Size..Inches. Battery.Life..Hours. RAM..GB.
## 1          455                   15                    5        1
## 2          545                   15                    6        1
## 3          515                   15                    4        1
## 4          395                   15                    5        1
## 5          585                   15                    6        2
## 6          555                   15                    6        1
##   Processor.Speeds..GHz. Integrated.Wireless. HD.Size..GB.
## 1                      2                  Yes           80
## 2                      2                   No          300
## 3                      2                  Yes          300
## 4                      2                   No           40
## 5                      2                   No          120
## 6                      2                  Yes          120
##   Bundled.Applications. customer.X customer.Y store.X store.Y
## 1                   Yes     532041     180995  534057  179682
## 2                    No     529240     175537  528739  173080
## 3                   Yes     533095     181047  535652  182961
## 4                   Yes     529902     179641  534057  179682
## 5                   Yes     531684     180948  528924  178440
## 6                   Yes     529207     180969  528924  178440

# Preview the last rows as well.
print(tail(laptop_data))

##                    Date Configuration Customer.Postcode Store.Postcode
## 297567 12/30/2008 23:54           854          SW19 3NW       SW12 9HD
## 297568 12/30/2008 23:55           703           SE1 2UP       SW1P 3AU
## 297569 12/30/2008 23:55           731           N13 4JD         N3 1DH
## 297570 12/30/2008 23:57           375          SE25 6EF        CR7 8LE
## 297571 12/30/2008 23:58           101           SW8 1LA       SW1P 3AU
## 297572 12/30/2008 23:58           343          SE16 4QZ        SE8 3JD
##        Retail.Price Screen.Size..Inches. Battery.Life..Hours. RAM..GB.
## 297567          780                   17                    6        4
## 297568           NA                   17                    5        4
## 297569          392                   17                    6        1
## 297570          441                   15                    6        2
## 297571          406                   15                    4        4
## 297572          530                   15                    6        2
##        Processor.Speeds..GHz. Integrated.Wireless. HD.Size..GB.
## 297567                    2.4                  Yes          120
## 297568                    2.0                   No          300
## 297569                    1.5                   No           80
## 297570                    2.4                  Yes          300
## 297571                    1.5                  Yes          120
## 297572                    1.5                  Yes          300
##        Bundled.Applications. customer.X customer.Y store.X store.Y
## 297567                    No     524772     169692  528739  173080
## 297568                   Yes     533595     180025  529902  179641
## 297569                   Yes     531165     192973  525109  190628
## 297570                   Yes     534086     168521  532714  168302
## 297571                   Yes     530716     177488  529902  179641
## 297572                   Yes     534348     179162  537175  177885

# Number of sales rows without a retail price.
missing_price <- is.na(laptop_data$Retail.Price)
print(sum(missing_price))

## [1] 13443

매출액 데이터에 결측값이 있음을 상위 6개, 하위 6개 데이터를 보면서 알 수 있었습니다. 결측치의 존재여부를 파악후, 결측치의 갯수를 계산한 결과 13443개의 매출액 결측치가 있었음을 알 수 있었습니다.

# Density-scaled histogram of retail price with a kernel density curve on top.
hist(
  laptop_data$Retail.Price,
  probability = TRUE,
  ylim = c(0, 0.005),
  main = "Retail Price", xlab = "Retail Price", ylab = "Frequency"
)
price_density <- density(x = laptop_data$Retail.Price, na.rm = TRUE)
lines(price_density, col = "Red")

실제로 노트북은 얼마에 판매되었는가?

결측치를 제거하고, Retail.Price의 분포를 히스토그램으로 그려 살펴본 결과, 500파운드 근처에서 가장 밀집되어 있음을 알 수 있었습니다.

# Per-configuration summary: mean and standard deviation of Retail.Price
# (computed on laptop_data, ignoring missing prices) and the number of rows
# per configuration in the NA-free data. Result: matrix `tmp`, one row per
# configuration id 1..max.
na_rm_laptop <- na.omit(laptop_data)
n_config <- max(na_rm_laptop$Configuration)
config_ids <- seq_len(n_config)

mean_data <- aggregate(laptop_data$Retail.Price,
                       by = list(laptop_data$Configuration),
                       FUN = mean, na.rm = TRUE)
sd_data <- aggregate(laptop_data$Retail.Price,
                     by = list(laptop_data$Configuration),
                     FUN = sd, na.rm = TRUE)

tmp <- matrix(0, nrow = n_config, ncol = 3)
colnames(tmp) <- c("Mean", "Standard Deviation", "Count")
rownames(tmp) <- config_ids

# Align by configuration id instead of by position, so a configuration id that
# is absent from the aggregate gets NA rather than shifting (or breaking) the
# assignment.
tmp[, 1] <- mean_data[match(config_ids, mean_data[, 1]), 2]
tmp[, 2] <- sd_data[match(config_ids, sd_data[, 1]), 2]

# tabulate() counts all ids in a single pass; this replaces one subset() scan
# of the whole table per configuration.
tmp[, 3] <- tabulate(na_rm_laptop$Configuration, nbins = n_config)

head(tmp)

##       Mean Standard Deviation Count
## 1 337.7628           27.28609   253
## 2 288.5492           24.68505   244
## 3 376.6780           28.20846   264
## 4 329.0316           27.18746   253
## 5 392.0983           34.24757   234
## 6 348.1106           25.69360   226

# Every sale's retail price against its configuration id.
# Axis labels corrected: "Retail pricfe" was misspelled and the x label had a
# stray leading space.
plot(na_rm_laptop$Retail.Price ~ na_rm_laptop$Configuration,
     ylim = c(100, 800), xlim = c(0, 900), pch = 19, col = 1,
     main = "Retail Price by Configuration",
     xlab = "Configuration",
     ylab = "Retail Price")

시간에 따라서 판매 가격이 변화하였는가?

# Average retail price per calendar month of 2008, drawn as a 12-point line.

# Parse the date part of the timestamp straight to Date. Going through
# as.POSIXct() first parses in the local time zone, and the later as.Date()
# conversion may use a different zone (UTC by default per ?as.Date), which can
# move local-midnight sales to the previous day - TODO confirm on target R.
sale_date <- as.Date(as.character(na_rm_laptop$Date), format = "%m/%d/%Y")

# Month as an integer 1-12. The previous as.integer(months(..., abbreviate =
# TRUE)) only works when the locale's abbreviated month names are bare
# numbers; with names such as "Jan" it yields NA.
na_rm_laptop$Date <- as.integer(format(sale_date, "%m"))
mean_by_month <- aggregate(na_rm_laptop$Retail.Price,
                           list(na_rm_laptop$Date), FUN = mean)

# The variable keeps its original name; calls to ts() still resolve to the
# function.
ts <- ts(mean_by_month$x, start = c(2008, 1), end = c(2008, 12),
         frequency = 12)

plot(1:12, ts, type = "o", pch = 19,
     main = "Retail Price Timeseries",
     xlab = "Month", ylab = "Average Retail Price", ylim = c(400, 600))
axis(1, at = 1:12, labels = 1:12)

# Configurations sorted from most to fewest sales (Count is column 3 of tmp).
count_rank <- order(tmp[, "Count"], decreasing = TRUE)
ordered_by_counts <- tmp[count_rank, ]
print(head(ordered_by_counts))

##         Mean Standard Deviation Count
## 61  388.8812           55.02394   825
## 345 430.1907           54.56083   816
## 207 469.2224           56.24305   814
## 53  411.4321           52.11270   810
## 340 439.5771           54.96629   804
## 63  447.9824           55.89726   795

# Average retail price per calendar day, drawn as one line per month with the
# day of the month on the x-axis, so months can be compared day by day.
na_rm_laptop <- na.omit(laptop_data)

# Parse the date part directly to Date (no POSIXct round trip, so no time-zone
# dependent day shift).
na_rm_laptop$Date <- as.Date(as.character(na_rm_laptop$Date),
                             format = "%m/%d/%Y")
mean_by_month_day <- aggregate(na_rm_laptop$Retail.Price,
                               list(na_rm_laptop$Date), FUN = mean)

# Numeric month and day of month. Unlike comparing months() against "1월" ...
# "12월", this does not depend on the session locale, and using the real day
# as x removes the hard-coded 1:31 / 1:29 / 1:30 lengths (which fail whenever
# a month has a different number of days with sales).
# Group.1 is left as a Date here rather than being overwritten with month names.
day_month <- as.integer(format(mean_by_month_day$Group.1, "%m"))
day_of_month <- as.integer(format(mean_by_month_day$Group.1, "%d"))

# Twelve distinct colours: plain colour indices 9-12 are not distinct entries
# of R's default 8-colour palette.
month_colors <- rainbow(12)

# Empty canvas first, then one line per month.
plot(c(1, 31), c(400, 600), type = "n", xaxt = "n",
     main = "Retail Price by Day", xlab = "Day", ylab = "Retail Price")
for (month_no in 1:12) {
  in_month <- day_month == month_no
  lines(day_of_month[in_month], mean_by_month_day$x[in_month],
        col = month_colors[month_no], type = "o", pch = 19)
}
axis(1, at = 1:31, labels = 1:31)

# The legend uses the same twelve colours as the lines (month.abb is R's
# built-in "Jan" ... "Dec" vector).
legend(32, 610, month.abb, pch = 19, cex = 0.5, col = month_colors,
       lty = 1, title = "Month", xpd = NA)

# Monthly average retail price for the three best-selling configurations
# (61, 345 and 207 - the top rows of ordered_by_counts), one line each.
na_rm_laptop <- na.omit(laptop_data)

# Month as an integer 1-12, taken from the date part of the timestamp.
# format(..., "%m") is locale-independent, whereas as.integer() on abbreviated
# month names yields NA for names such as "Jan"; parsing straight to Date also
# avoids a time-zone dependent day shift from the POSIXct round trip.
sale_date <- as.Date(as.character(na_rm_laptop$Date), format = "%m/%d/%Y")
na_rm_laptop$Date <- as.integer(format(sale_date, "%m"))

mean_by_month <- aggregate(na_rm_laptop$Retail.Price,
                           list(na_rm_laptop$Date), FUN = mean)
top1 <- subset(na_rm_laptop, Configuration == 61)
top2 <- subset(na_rm_laptop, Configuration == 345)
top3 <- subset(na_rm_laptop, Configuration == 207)
mean_by_month_top1 <- aggregate(top1$Retail.Price, list(top1$Date), FUN = mean)
mean_by_month_top2 <- aggregate(top2$Retail.Price, list(top2$Date), FUN = mean)
mean_by_month_top3 <- aggregate(top3$Retail.Price, list(top3$Date), FUN = mean)
ts1 <- ts(mean_by_month_top1$x, start = c(2008, 1), end = c(2008, 12),
          frequency = 12)
ts2 <- ts(mean_by_month_top2$x, start = c(2008, 1), end = c(2008, 12),
          frequency = 12)
ts3 <- ts(mean_by_month_top3$x, start = c(2008, 1), end = c(2008, 12),
          frequency = 12)
plot(1:12, ts1, type = "l",
     main = "TOP 3 Items' Average Retail Price",
     xlab = "Month", ylab = "Average Retail Price", ylim = c(300, 600))
lines(1:12, ts2, col = 2)
lines(1:12, ts3, col = 3)

colors <- 1:3
# Line-only legend: the series are drawn without point markers, so the legend
# no longer shows any.
legend(10, 600, c(61, 345, 207), cex = 0.8, col = colors, lty = 1,
       title = "Configuration", xpd = NA)

상위 3개 제품별 평균 매출가격을 시계열

# Number of sales rows for configuration 72 in the unfiltered data.
is_config_72 <- laptop_data$Configuration == 72
print(sum(is_config_72, na.rm = TRUE))

## [1] 821

# How many of the configuration-72 rows have no retail price.
config_72_rows <- which(laptop_data$Configuration == 72)
print(sum(is.na(laptop_data$Retail.Price[config_72_rows])))

## [1] 82

원래 있던 데이터세트에서 configuration별 판매량 상위 3개를 뽑으면 61, 72, 316의 제품번호를 가진 제품들이 상위 3개 품목이 되지만, 결측값들을 제거하고 상위 3개를 뽑았을 경우에는 61, 345, 207의 제품번호를 가진 제품들이 상위 3개 품목이 됩니다.

판매가격은 각 매장별로 일관성이 있는가?
판매 가격은 컴퓨터 사양에 따라 어떻게 다른가?

매장에 관한 질문

매장의 위치와 고객의 위치는 어디인가?
어느 매장이 가장 많이 판매하는가?
고객은 노트북을 사기 위해서 얼마나 이동하는가?
고객이 매장을 방문하기 위해 얼마나 이동하는지 다른 방법을 통해 알아보자. 고객과 매장 간의 이동거리를 담은 새로운 데이터 열을 만들어서 이 문제를 풀어보자

매출에 관한 질문

각 매장에 매출액이 Acell사 전체 매출액과 어떤 연관성이 있는가?
이 연관성은 노트북 사양에 따라 어떻게 영향을 받는가?

노트북 사양에 관한 질문

각 노트북 사양의 상세한 내용은 무엇인가? 이것은 판매가격과 어떤 관련성이 있는가?
각 매장들이 모든 노트북 사양을 판매하는가?

4.1 아침식사용 시리얼.

4.8절에 있는 아침식사용 시리얼 예제의 데이터를 사용하여 아래와 같이 데이터를 탐색하고 요약하시오.

어느 변수들이 양적/수치적 변수인가? 순서형 변수는 어떤 것인가? 명목형 변수는 어떤 것인가?

# Load the breakfast-cereal data and preview the first rows.
cereals <- read.csv(file = "Cereals.csv")
print(head(cereals))

##                        name mfr type calories protein fat sodium fiber
## 1                 100%_Bran   N    C       70       4   1    130  10.0
## 2         100%_Natural_Bran   Q    C      120       3   5     15   2.0
## 3                  All-Bran   K    C       70       4   1    260   9.0
## 4 All-Bran_with_Extra_Fiber   K    C       50       4   0    140  14.0
## 5            Almond_Delight   R    C      110       2   2    200   1.0
## 6   Apple_Cinnamon_Cheerios   G    C      110       2   2    180   1.5
##   carbo sugars potass vitamins shelf weight cups   rating
## 1   5.0      6    280       25     3      1 0.33 68.40297
## 2   8.0      8    135        0     3      1 1.00 33.98368
## 3   7.0      5    320       25     3      1 0.33 59.42551
## 4   8.0      0    330       25     3      1 0.50 93.70491
## 5  14.0      8     NA       25     3      1 0.75 34.38484
## 6  10.5     10     70       25     1      1 0.75 29.50954

# Column types, used to tell quantitative, ordinal and nominal variables apart.
str(object = cereals)

## 'data.frame':    77 obs. of  16 variables:
##  $ name    : Factor w/ 77 levels "100%_Bran","100%_Natural_Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ mfr     : Factor w/ 7 levels "A","G","K","N",..: 4 6 3 3 7 2 3 2 7 5 ...
##  $ type    : Factor w/ 2 levels "C","H": 1 1 1 1 1 1 1 1 1 1 ...
##  $ calories: int  70 120 70 50 110 110 110 130 90 90 ...
##  $ protein : int  4 3 4 4 2 2 2 3 2 3 ...
##  $ fat     : int  1 5 1 0 2 2 0 2 1 0 ...
##  $ sodium  : int  130 15 260 140 200 180 125 210 200 210 ...
##  $ fiber   : num  10 2 9 14 1 1.5 1 2 4 5 ...
##  $ carbo   : num  5 8 7 8 14 10.5 11 18 15 13 ...
##  $ sugars  : int  6 8 5 0 8 10 14 8 6 5 ...
##  $ potass  : int  280 135 320 330 NA 70 30 100 125 190 ...
##  $ vitamins: int  25 0 25 25 25 25 25 25 25 25 ...
##  $ shelf   : int  3 3 3 3 3 1 2 3 1 3 ...
##  $ weight  : num  1 1 1 1 1 1 1 1.33 1 1 ...
##  $ cups    : num  0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
##  $ rating  : num  68.4 34 59.4 93.7 34.4 ...

양적/수치적 변수: calories, protein, fat, sodium, fiber, carbo, sugars, potass, vitamins, weight, cups, rating이 있습니다. 순서형 변수: shelf가 있습니다. 명목형 변수: name, mfr, type이 있습니다.

각각의 양적 변수에 대해서 평균, 중앙값, 최솟값, 최댓값, 그리고 표준편차를 계산하시오. 이것은 R의 sapply()함수를 통해 계산될 수 있다.(예를들어, sapply(data, mean, na.rm=TRUE))

# Quantitative columns: columns 4-16 of cereals without the 10th of them.
quan_data <- cereals[, (4:16)[-10]]

# One row per variable, one column per statistic; missing values are ignored.
summary_data <- do.call(
  data.frame,
  lapply(
    list(mean = mean, median = median, min = min, max = max, sd = sd),
    function(stat) vapply(quan_data, stat, numeric(1), na.rm = TRUE)
  )
)
print(summary_data)

##                mean    median      min       max         sd
## calories 106.883117 110.00000 50.00000 160.00000 19.4841191
## protein    2.545455   3.00000  1.00000   6.00000  1.0947897
## fat        1.012987   1.00000  0.00000   5.00000  1.0064726
## sodium   159.675325 180.00000  0.00000 320.00000 83.8322952
## fiber      2.151948   2.00000  0.00000  14.00000  2.3833640
## carbo     14.802632  14.50000  5.00000  23.00000  3.9073256
## sugars     7.026316   7.00000  0.00000  15.00000  4.3786564
## potass    98.666667  90.00000 15.00000 330.00000 70.4106360
## vitamins  28.246753  25.00000  0.00000 100.00000 22.3425225
## weight     1.029610   1.00000  0.50000   1.50000  0.1504768
## cups       0.821039   0.75000  0.25000   1.50000  0.2327161
## rating    42.665705  40.40021 18.04285  93.70491 14.0472887

R을 이용하여 양적 변수 각각에 대해 히스토그램을 작성하시오. 히스토그램과 요약통계량을 바탕으로 다음 질문에 답하시오.

# 3 x 4 grid (filled column by column) with one histogram per variable.
par(mfcol = c(3, 4))

# column name, plot title, x-axis label
hist_labels <- rbind(
  c("calories", "Calories Histogram", "Calories"),
  c("protein", "Protein Histogram", "Protein"),
  c("fat", "Fat Histogram", "Fat"),
  c("sodium", "Sodium Histogram", "Sodium"),
  c("fiber", "Fiber Histogram", "Fiber"),
  c("carbo", "Carbo Histogram", "Carbo"),
  c("sugars", "Sugars Histogram", "Sugars"),
  c("potass", "Potass Histogram", "Potass"),
  c("vitamins", "Vitamins Histogram", "Vitamins"),
  c("weight", "Weight Histogram", "Weights"),
  c("cups", "Cups Histogram", "Cups"),
  c("rating", "Rating Histogram", "Rating")
)

for (k in seq_len(nrow(hist_labels))) {
  hist(
    quan_data[[hist_labels[k, 1]]],
    main = hist_labels[k, 2],
    xlab = hist_labels[k, 3]
  )
}

# Show the summary table again next to the histograms.
print(summary_data)

##                mean    median      min       max         sd
## calories 106.883117 110.00000 50.00000 160.00000 19.4841191
## protein    2.545455   3.00000  1.00000   6.00000  1.0947897
## fat        1.012987   1.00000  0.00000   5.00000  1.0064726
## sodium   159.675325 180.00000  0.00000 320.00000 83.8322952
## fiber      2.151948   2.00000  0.00000  14.00000  2.3833640
## carbo     14.802632  14.50000  5.00000  23.00000  3.9073256
## sugars     7.026316   7.00000  0.00000  15.00000  4.3786564
## potass    98.666667  90.00000 15.00000 330.00000 70.4106360
## vitamins  28.246753  25.00000  0.00000 100.00000 22.3425225
## weight     1.029610   1.00000  0.50000   1.50000  0.1504768
## cups       0.821039   0.75000  0.25000   1.50000  0.2327161
## rating    42.665705  40.40021 18.04285  93.70491 14.0472887

R을 이용하여 저온용과 고온용 시리얼에 함유된 칼로리를 비교하기 위해 박스플롯을 나란히 그리시오. 이를통해 무엇을 알 수 있는가?

# Split by cereal type ("C" / "H"), then draw the two calorie boxplots side by
# side.
hot_data <- cereals[which(cereals$type == "H"), ]
cold_data <- cereals[which(cereals$type == "C"), ]

par(mfcol = c(1, 2))
boxplot(
  hot_data$calories,
  main = "Boxplot Hot-cereals' Calories",
  xlab = "Hot", ylab = "Calories"
)
boxplot(
  cold_data$calories,
  main = "Boxplot Cold-cereals' Calories",
  xlab = "Cold", ylab = "Calories"
)

R을 사용하여 시리얼이 전시된 진열대 높이에 따라 소비자 평점의 박스플롯을 나란히 그리시오. 진열대 높이로부터 소비자 평점을 예측할 수 있다면, 진열대 높이의 세가지 범주를 그대로 유지할 필요가 있겠는가?

# One rating box per shelf level.
boxplot(
  rating ~ shelf,
  data = cereals,
  main = "Boxplot for Rating by Shelf levels",
  xlab = "Shelf levels",
  ylab = "Rating"
)

양적 변수에 대한 상관계수 표를 작성하시오[cor()함수 사용]. 또한 이들 변수에 대한 산점도 행렬을 작성하시오.[plot(data)함수 사용]

# Total number of missing cells in the quantitative columns.
print(sum(is.na(quan_data)))

## [1] 4

# Drop every row that has at least one missing value, then confirm none remain.
na_rm_quan_data <- na.omit(quan_data)
print(sum(is.na(na_rm_quan_data)))

## [1] 0

# Pairwise correlations of the complete rows, rounded to two decimals.
correlation_table <- cor(na_rm_quan_data)
print(round(correlation_table, digits = 2))

##          calories protein   fat sodium fiber carbo sugars potass vitamins
## calories     1.00    0.03  0.51   0.30 -0.30  0.27   0.57  -0.07     0.26
## protein      0.03    1.00  0.20   0.01  0.51 -0.04  -0.29   0.58     0.05
## fat          0.51    0.20  1.00   0.00  0.01 -0.28   0.29   0.20    -0.03
## sodium       0.30    0.01  0.00   1.00 -0.07  0.33   0.04  -0.04     0.33
## fiber       -0.30    0.51  0.01  -0.07  1.00 -0.38  -0.15   0.91    -0.04
## carbo        0.27   -0.04 -0.28   0.33 -0.38  1.00  -0.45  -0.37     0.25
## sugars       0.57   -0.29  0.29   0.04 -0.15 -0.45   1.00   0.00     0.07
## potass      -0.07    0.58  0.20  -0.04  0.91 -0.37   0.00   1.00     0.00
## vitamins     0.26    0.05 -0.03   0.33 -0.04  0.25   0.07   0.00     1.00
## weight       0.70    0.23  0.22   0.31  0.25  0.14   0.46   0.42     0.32
## cups         0.09   -0.24 -0.16   0.12 -0.51  0.36  -0.03  -0.50     0.13
## rating      -0.69    0.47 -0.41  -0.38  0.60  0.06  -0.76   0.42    -0.21
##          weight  cups rating
## calories   0.70  0.09  -0.69
## protein    0.23 -0.24   0.47
## fat        0.22 -0.16  -0.41
## sodium     0.31  0.12  -0.38
## fiber      0.25 -0.51   0.60
## carbo      0.14  0.36   0.06
## sugars     0.46 -0.03  -0.76
## potass     0.42 -0.50   0.42
## vitamins   0.32  0.13  -0.21
## weight     1.00 -0.20  -0.30
## cups      -0.20  1.00  -0.22
## rating    -0.30 -0.22   1.00

# Panel function for a scatter-plot matrix: writes the correlation of x and y
# (rounded to two decimals) in the middle of the panel, with a text size that
# grows with the absolute correlation.
#
# x, y: numeric vectors of equal length (the two variables of the panel).
# Returns the value of text(); called for its drawing side effect.
panel_for_cor <- function(x, y) {
  # Switch the panel to unit coordinates while drawing and restore the previous
  # coordinates on exit. par(usr) with an unnamed numeric vector does not set
  # anything, so the restore has to name the parameter.
  usr <- par("usr")
  on.exit(par(usr = usr), add = TRUE)
  par(usr = c(0, 1, 0, 1))

  r <- round(cor(x, y), digits = 2)
  txt <- paste0(r)
  # Base magnification chosen so the label takes a fixed share of panel width.
  cex.cor <- 0.3 / strwidth(txt)
  # The magnification must be positive to be applied; the previous
  # -abs(...) value was never a valid size, so the scaling was lost.
  text(0.5, 0.5, txt, cex = abs(cex.cor * r))
}

# Panel function for the upper triangle: small filled dots for each (x, y).
upper_panel <- function(x, y) {
  points(x = x, y = y, cex = 0.2, pch = 19)
}
# Scatter-plot matrix of the complete rows: point clouds above the diagonal,
# correlation coefficients below it. Title spelling corrected
# ("Quantatitive" -> "Quantitative").
plot(na_rm_quan_data,
     upper.panel = upper_panel,
     lower.panel = panel_for_cor,
     main = "Scatter Matrix for Quantitative Variables"
     )

4.3 토요타 코롤라 자동차 판매.

ToyotaCorolla.csv 파일은 네덜란드에서 2004년 늦여름 동안 판매되었던 중고차(도요타 코롤라)에 대한 데이터이다. 1436개의 레코드에는 각 중고차의 가격(Price), 사용기간(Age), 주행거리(Kilometers), 마력(HP) 및 기타 사양을 포함하여 38개의 특성에 관한 세부사항이 포함되어 있다. 분석 목표는 사양에 따른 중고 도요타 코롤라의 가격을 예측하는 것이다.

범주형 변수를 확인하시오

# Load the used Toyota Corolla listings and inspect the column types.
toyota <- read.csv(file = "ToyotaCorolla.csv")
str(object = toyota)

## 'data.frame':    1436 obs. of  39 variables:
##  $ Id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Model            : Factor w/ 319 levels "TOYOTA Corolla ",..: 276 276 276 276 275 275 269 269 257 246 ...
##  $ Price            : int  13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
##  $ Age_08_04        : int  23 23 24 26 30 32 27 30 27 23 ...
##  $ Mfg_Month        : int  10 10 9 7 3 1 6 3 6 10 ...
##  $ Mfg_Year         : int  2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
##  $ KM               : int  46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
##  $ Fuel_Type        : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 2 2 2 2 3 2 ...
##  $ HP               : int  90 90 90 90 90 90 90 90 192 69 ...
##  $ Met_Color        : int  1 1 1 0 0 0 1 1 0 0 ...
##  $ Color            : Factor w/ 10 levels "Beige","Black",..: 3 7 3 2 2 9 5 5 6 3 ...
##  $ Automatic        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CC               : int  2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
##  $ Doors            : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Cylinders        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Gears            : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Quarterly_Tax    : int  210 210 210 210 210 210 210 210 100 185 ...
##  $ Weight           : int  1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
##  $ Mfr_Guarantee    : int  0 0 1 1 1 0 0 1 0 0 ...
##  $ BOVAG_Guarantee  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Guarantee_Period : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ ABS              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Airbag_1         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Airbag_2         : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ Airco            : int  0 1 0 0 1 1 1 1 1 1 ...
##  $ Automatic_airco  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Boardcomputer    : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ CD_Player        : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ Central_Lock     : int  1 1 0 0 1 1 1 1 1 0 ...
##  $ Powered_Windows  : int  1 0 0 0 1 1 1 1 1 0 ...
##  $ Power_Steering   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Radio            : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ Mistlamps        : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ Sport_Model      : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Backseat_Divider : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ Metallic_Rim     : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ Radio_cassette   : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ Parking_Assistant: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tow_Bar          : int  0 0 0 0 0 0 0 0 0 0 ...

범주형 변수와 그로부터 파생된 이진형 가변수들의 관계를 설명하시오
N개의 범주를 가진 범주형 변수의 정보를 표현하기 위해 얼마나 많은 가변수들이 필요한가?
R을 사용하여 이 데이터세트에 있는 범주형 변수들을 이진형 가변수로 변환하고, 하나의 레코드에 대해서 이진형 가변수들의 값을 말로 설명하시오.
R을 사용하여 상관행렬을 생성하고, 산점도 행렬을 작성하시오. 변수들 간의 관계에 대해 설명하시오.