Data downloaded from the book site

# Load the full processor data set from the CSV file; the relative path is
# resolved against the current working directory.
processors <- read.csv(file = "all-data.csv")


################################################################
#
# This function returns the data from the desired column.
# Example:  clock<-get_column("Fp2000","Processor.Clock..MHz.")

get_column <- function(x, y, data = processors) {
  # Return the values of column `y` for every row that has a result for
  # benchmark `x`.
  #
  # x    = string with the name of the desired benchmark, e.g. "Fp2000"
  # y    = name of the desired column, e.g. "Processor.Clock..MHz."
  # data = data frame to read from; defaults to the global `processors`
  #        table, so existing two-argument calls behave exactly as before
  #
  # The benchmark's result column is named "Spec<x>..average.base.".
  # Rows where that column is NA are dropped from the result.
  benchmark <- paste0("Spec", x, "..average.base.")

  # Logical mask of the rows that have an entry for the benchmark.
  ix <- !is.na(data[, benchmark])
  data[ix, y]
}
################################################################




################################################################
# This function extracts the interesting data columns for the given benchmark
# program and returns a dataframe with these columns.

extract_data <- function(benchmark, data = processors) {
  # Build a data frame with the interesting columns for one benchmark.
  #
  # benchmark = string with the benchmark name, e.g. "Int1992"
  # data      = data frame to read from; defaults to the global
  #             `processors` table, so one-argument calls behave as before
  #
  # Returns a data frame with one row per processor that has a result for
  # the benchmark, and the columns nperf, perf, clock, threads, cores, TDP,
  # transistors, dieSize, voltage, featureSize, channel, FO4delay,
  # L1icache, L1dcache, L2cache and L3cache.
  perf_col <- paste0("Spec", benchmark, "..average.base.")

  # Rows that have an entry for the benchmark. Computed once and reused
  # for every column instead of being recomputed per column.
  has_result <- !is.na(data[, perf_col])

  # perf = the performance reported in the database
  perf <- data[has_result, perf_col]

  # nperf = performance normalized to the overall range (0 to 100).
  # If every perf value is identical the span is 0 and nperf is NaN.
  min_perf <- min(perf)
  perf_span <- max(perf) - min_perf
  nperf <- 100 * (perf - min_perf) / perf_span

  # Output column name -> source column name in `data`.
  source_cols <- c(
    clock = "Processor.Clock..MHz.",
    threads = "Threads.core",
    cores = "Cores",
    TDP = "TDP",
    transistors = "Transistors..millions.",
    dieSize = "Die.size..mm.2.",
    voltage = "Voltage..low.",
    featureSize = "Feature.Size..microns.",
    channel = "Channel.length..microns.",
    FO4delay = "FO4.Delay..ps.",
    L1icache = "L1..instruction...on.chip.",
    L1dcache = "L1..data...on.chip.",
    L2cache = "L2..on.chip.",
    L3cache = "L3..on.chip."
  )

  # lapply keeps the names of `source_cols`, which become the column names.
  features <- lapply(source_cols, function(col) data[has_result, col])

  data.frame(nperf = nperf, perf = perf, features)
}
################################################################


# Build the data frame for the Int1992 benchmark; `df` is a short alias for
# the same data, used by the analysis below.

df <- int92.dat <- extract_data("Int1992")
head(df, n = 6L)
##       nperf      perf clock threads cores TDP transistors dieSize voltage
## 1  9.662070  68.60000   100       1     1  30        1.68     234     3.3
## 2  7.996196  63.10000   125       1     1  30        1.68     234     3.3
## 3 16.363872  90.72647   166       1     1  30        1.68     234     3.3
## 4 13.720745  82.00000   175       1     1  30        1.68     234     3.3
## 5 24.613994 117.96483   190       1     1  30        1.68     234     3.3
## 6 27.245938 126.65437   200       1     1  30        1.68     234     3.3
##   featureSize channel FO4delay L1icache L1dcache L2cache L3cache
## 1        0.75    0.75      270        8        8      NA      NA
## 2        0.75    0.75      270        8        8      NA      NA
## 3        0.75    0.75      270        8        8      NA      NA
## 4        0.75    0.75      270        8        8      NA      NA
## 5        0.75    0.75      270        8        8      NA      NA
## 6        0.75    0.75      270        8        8      NA      NA

Dependent variable: performance (perf). Independent variable: clock.

# Scatter plot of performance against clock for the data in `df`.
# `df` holds the Int1992 extract, so the title names that benchmark.
plot(df[, "clock"], df[, "perf"],
  main = "Int1992",
  xlab = "Clock", ylab = "Performance"
)

The scatter plot shows a roughly linear relationship between our two variables (clock and performance).

Let’s fit the model:

# Simple linear regression of perf on clock, then print the coefficient
# table and the goodness-of-fit statistics.
fit <- lm(formula = perf ~ clock, data = df)
summary(fit)
## 
## Call:
## lm(formula = perf ~ clock, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -112.677  -34.603    0.681   24.328  158.241 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.02525   12.24693   1.309    0.195    
## clock        0.80239    0.07982  10.053 1.32e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.5 on 76 degrees of freedom
## Multiple R-squared:  0.5708, Adjusted R-squared:  0.5651 
## F-statistic: 101.1 on 1 and 76 DF,  p-value: 1.32e-15

The R-squared value of 0.5708 tells us that about 57% of the variance in performance is explained by clock.

From above statistics: Intercept = 16.02525 and slope = 0.80239

The p-value for clock (1.32e-15) is highly significant, since it is far below 0.05 (if you can’t read the exponential notation, the three stars *** next to the p-value show that it is significant at the 0.001 level :) ). There are no stars next to the intercept because its p-value (0.195) is greater than 0.1, so the intercept is not significantly different from zero, but we usually don’t care about the intercept’s significance.

Our regression model is: \(\text{Performance} = 16.02525 + 0.80239 \times \text{clock}\)

Because the R-squared value of 0.5708 is far from 1, adding more explanatory variables may move it closer to 1 if those variables are significant. R-squared itself never decreases when a variable is added, so for variables that are not significant we will be penalized by the Adjusted R-squared instead. We only have one variable here, so the two values are close, with the adjusted one always being the lower of the two.

Residual Analysis:

# Draw the diagnostic plots for `fit` in a 2 x 2 grid. par() returns the
# settings it replaced, so they are saved and restored afterwards; otherwise
# every later plot would also be squeezed into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)

From the normal Q-Q plot above, we can see that the residuals are approximately normally distributed, except at the very extreme ends.