# Load the processor data set from "all-data.csv" (resolved relative to the
# current working directory) into the global data frame `processors`, which
# get_column() reads directly.
# NOTE(review): the column names used later (e.g. "Processor.Clock..MHz.")
# look like read.csv()'s default name mangling of the CSV headers -- confirm
# against the file before changing `check.names`.
processors <- read.csv("all-data.csv")
################################################################
#
# Return the values of one column, restricted to the rows that have a
# result for the given benchmark.
# Example: clock <- get_column("Fp2000", "Processor.Clock..MHz.")
#
# x    = string with the name of the desired benchmark (e.g. "Int1992");
#        the rows kept are those whose "Spec<x>..average.base." entry is
#        not NA
# y    = desired column
# data = data frame to read from; defaults to the global `processors`, so
#        existing two-argument calls behave as before
get_column <- function(x, y, data = processors) {
  benchmark <- paste0("Spec", x, "..average.base.")
  # Fail with a clear message instead of the generic "undefined columns
  # selected" error when the benchmark name is misspelled.
  if (!all(benchmark %in% names(data))) {
    stop("no column '", benchmark, "' for benchmark '", x, "'",
         call. = FALSE)
  }
  # Keep only the rows that have an entry for the indicated benchmark.
  has_result <- !is.na(data[, benchmark])
  data[has_result, y]
}
################################################################
################################################################
# Build the analysis data frame for one benchmark program.
#
# benchmark = string with the benchmark name (e.g. "Int1992")
#
# The result has one row per processor that reports a result for the
# benchmark, with the columns
#   nperf  = performance rescaled so the observed range maps onto 0..100
#   perf   = performance as reported in the database
# followed by the processor characteristics listed in `feature_columns`.
extract_data <- function(benchmark) {
  perf_column <- paste0("Spec", benchmark, "..average.base.")
  perf <- get_column(benchmark, perf_column)

  # Normalize the performance to the overall observed range.
  highest <- max(perf)
  lowest <- min(perf)
  nperf <- 100 * (perf - lowest) / (highest - lowest)

  # Output column name -> column name in the raw data set.
  feature_columns <- c(
    clock       = "Processor.Clock..MHz.",
    threads     = "Threads.core",
    cores       = "Cores",
    TDP         = "TDP",
    transistors = "Transistors..millions.",
    dieSize     = "Die.size..mm.2.",
    voltage     = "Voltage..low.",
    featureSize = "Feature.Size..microns.",
    channel     = "Channel.length..microns.",
    FO4delay    = "FO4.Delay..ps.",
    L1icache    = "L1..instruction...on.chip.",
    L1dcache    = "L1..data...on.chip.",
    L2cache     = "L2..on.chip.",
    L3cache     = "L3..on.chip."
  )
  # lapply() keeps the names, so the list elements become the data frame's
  # column names in the order given above.
  features <- lapply(feature_columns, function(raw_name) {
    get_column(benchmark, raw_name)
  })

  data.frame(nperf, perf, features)
}
################################################################
# Extract the data frame for the "Int1992" benchmark program and alias it
# as `df` for the analysis that follows.
int92.dat <- extract_data("Int1992")
df <- int92.dat
# Preview the first rows of the extracted data.
head(df)
## nperf perf clock threads cores TDP transistors dieSize voltage
## 1 9.662070 68.60000 100 1 1 30 1.68 234 3.3
## 2 7.996196 63.10000 125 1 1 30 1.68 234 3.3
## 3 16.363872 90.72647 166 1 1 30 1.68 234 3.3
## 4 13.720745 82.00000 175 1 1 30 1.68 234 3.3
## 5 24.613994 117.96483 190 1 1 30 1.68 234 3.3
## 6 27.245938 126.65437 200 1 1 30 1.68 234 3.3
## featureSize channel FO4delay L1icache L1dcache L2cache L3cache
## 1 0.75 0.75 270 8 8 NA NA
## 2 0.75 0.75 270 8 8 NA NA
## 3 0.75 0.75 270 8 8 NA NA
## 4 0.75 0.75 270 8 8 NA NA
## 5 0.75 0.75 270 8 8 NA NA
## 6 0.75 0.75 270 8 8 NA NA
Dependent variable: performance (perf); independent variable: clock.
# Scatter plot of reported performance against clock speed.
# The title names the benchmark actually held in `df` (Int1992).
plot(df[, "clock"], df[, "perf"], main = "Int1992",
     xlab = "Clock", ylab = "Performance")
The scatter plot shows an approximately linear relationship between our two variables (clock and performance).
Let’s fit the model:
# Simple linear regression of performance on clock speed.
fit <- lm(perf~clock, data = df)
# Print the residual summary, coefficient table and R-squared for the fit.
summary(fit)
##
## Call:
## lm(formula = perf ~ clock, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112.677 -34.603 0.681 24.328 158.241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.02525 12.24693 1.309 0.195
## clock 0.80239 0.07982 10.053 1.32e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.5 on 76 degrees of freedom
## Multiple R-squared: 0.5708, Adjusted R-squared: 0.5651
## F-statistic: 101.1 on 1 and 76 DF, p-value: 1.32e-15
The R-squared value of 0.5708 tells us that about 57% of the variance in performance is explained by the clock.
From above statistics: Intercept = 16.02525 and slope = 0.80239
The p-value for the clock coefficient (1.32e-15) is highly significant, since it is far below 0.05 (if you can’t read the exponential notation, the three stars *** next to the p-value tell you it is significant at the 0.001 level :) ). There are no stars next to the intercept’s p-value (0.195), which means it is greater than 0.1 and therefore not significant, but we usually don’t care about the intercept’s significance.
Our regression model is: \(Performance = 16.02525 + 0.80239 * clock\)
Because R-squared is 0.5708, which is far from 1, adding more explanatory variables may raise it towards 1. R-squared never decreases when a variable is added, so if the added variables are not significant we will instead be penalized by the adjusted R-squared. We only have one variable here, so the two R-squared values are close, with the adjusted one always slightly lower.
Residual Analysis:
# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics parameters so later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)
From the normal Q-Q plot above, we can see that the residuals are approximately normally distributed, except at the extreme ends (the tails).