1 Please read the train.csv file into R and store the data in a variable called “X”. The file is available at https://www.kaggle.com/c/home-depot-product-search-relevance/data
# Load the training data into `X` and preview its first rows.
# NOTE(review): the path below is machine-specific; adjust it before running
# on another machine.
X <- read.csv(
  file = "C:\\CS 698\\Kaggle competition\\train.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable and can be reassigned
)
head(X)
## id product_uid
## 1 2 100001
## 2 3 100001
## 3 9 100002
## 4 16 100005
## 5 17 100005
## 6 18 100006
## product_title
## 1 Simpson Strong-Tie 12-Gauge Angle
## 2 Simpson Strong-Tie 12-Gauge Angle
## 3 BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating
## 4 Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included)
## 5 Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included)
## 6 Whirlpool 1.9 cu. ft. Over the Range Convection Microwave in Stainless Steel with Sensor Cooking
## search_term relevance
## 1 angle bracket 3.00
## 2 l bracket 2.50
## 3 deck over 3.00
## 4 rain shower head 2.33
## 5 shower only faucet 2.67
## 6 convection otr 3.00
2 Write a function called “distinct_relevance” that counts how many distinct values are in the column “relevance”, so that calling distinct_relevance(vect = X$relevance) returns the desired result.
# Count the distinct values in `vect`.
#
# Hand-rolled counterpart of length(unique(vect)): walks the vector once and
# records each value the first time it is encountered.
#
# vect: vector to inspect; defaults to the `relevance` column of the global
#       data frame `X`.
# Returns the number of distinct values (0 for an empty vector).
distinct_relevance <- function(vect = X$relevance) {
  seen <- numeric()
  n_seen <- 0
  # seq_along() yields an empty sequence for empty input, whereas
  # 1:length(vect) would iterate over c(1, 0) and abort inside `if`.
  for (i in seq_along(vect)) {
    if (!(vect[i] %in% seen)) {
      n_seen <- n_seen + 1
      seen[n_seen] <- vect[i]
    }
  }
  length(seen)
}
# Number of distinct relevance scores in the training data.
distinct_relevance(vect = X$relevance)
## [1] 13
3 Write a function called “count” that counts the number of appearances of a value, e.g. 3, in the column “relevance”, so that calling count(vect = X$relevance, value = 3) returns the desired result.
# Count how many elements of `vect` equal `value`.
#
# vect:  vector to scan; defaults to the `relevance` column of the global
#        data frame `X`.
# value: value to look for (default 3).
# Returns the number of matching elements. Missing values never match, so a
# vector containing NA is counted instead of aborting the loop.
count <- function(vect = X$relevance, value = 3) {
  hits <- 0
  for (i in seq_along(vect)) {
    is_match <- vect[i] == value
    # An NA comparison would make a bare `if (is_match)` fail with
    # "missing value where TRUE/FALSE needed"; treat it as "no match".
    if (!is.na(is_match) && is_match) {
      hits <- hits + 1
    }
  }
  hits
}
# Occurrences of the score 3 in the relevance column.
count(vect = X$relevance, value = 3)
## [1] 19125
4 Compare the results with those of the R function table().
# Frequency of every relevance score, to cross-check the counts above.
table(X[["relevance"]])
##
## 1 1.25 1.33 1.5 1.67 1.75 2 2.25 2.33 2.5 2.67 2.75
## 2105 4 3006 5 6780 9 11730 11 16060 19 15202 11
## 3
## 19125
5 How many terms of the Leibniz series 4 × (1 − 1/3 + 1/5 − …) does it take for the first 3 digits of the estimate of pi to be correct, i.e. 3.14? Write an R function getPi(k) that computes this, where k specifies how many leading digits must be correct, and returns the number of terms.
# Number of terms of the Leibniz series 4 * sum((-1)^i / (2 * i + 1)) needed
# before the first `k` digits of the partial sum agree with pi.
#
# Digits are compared on the character form of the numbers: the first k + 1
# characters cover k digits plus the decimal point (e.g. "3.14" for k = 3).
#
# k: number of leading digits that must match; a single number in 0..15.
#    The character form of a double carries at most 15 significant digits,
#    so a larger k could never be satisfied.
# Returns the number of series terms summed when the digits first match.
getPi <- function(k) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0, k <= 15)

  # Loop-invariant: the leading characters of pi that the estimate must match.
  target <- substr(as.character(pi), 1, k + 1)

  est <- 0
  nt <- 0
  term_sign <- 1
  repeat {
    # Add one term to a running sum instead of rebuilding the whole partial
    # sum on every pass, which made the search quadratic in the term count.
    est <- est + term_sign * 4 / (2 * nt + 1)
    term_sign <- -term_sign
    nt <- nt + 1
    if (substr(as.character(est), 1, k + 1) == target) {
      break
    }
  }
  nt
}
# Terms needed before the estimate reads 3.14.
getPi(k = 3)
## [1] 119
# Count the distinct values in `vect` by collecting each value the first
# time it appears.
#
# vect: vector to inspect; defaults to the `relevance` column of the global
#       data frame `X`.
# Returns the number of distinct values (0 for an empty vector).
distinct_relevance <- function(vect = X$relevance) {
  distinct <- numeric()
  # seq_along() is empty for empty input; 1:length(vect) would instead step
  # through c(1, 0) and fail on the zero-length element.
  for (i in seq_along(vect)) {
    if (!(vect[i] %in% distinct)) {
      distinct <- c(distinct, vect[i])
    }
  }
  length(distinct)
}
# Distinct relevance scores, using the revised implementation.
distinct_relevance(vect = X$relevance)
## [1] 13
# Built-in cross-check for the distinct-value count.
length(unique(X[["relevance"]]))
## [1] 13
# Vectorised count of the elements of `vect` equal to `value`.
#
# vect:  vector to scan; defaults to the `relevance` column of the global
#        data frame `X`.
# value: value to look for (default 3).
# Returns the number of matching elements; missing values never match.
count <- function(vect = X$relevance, value = 3) {
  # na.rm = TRUE keeps a single NA in `vect` from turning the total into NA.
  sum(vect == value, na.rm = TRUE)
}
# Occurrences of the score 3, with both defaults written out.
count(vect = X$relevance, value = 3)
## [1] 19125