knitr::opts_chunk$set(echo = TRUE)
#load libraries
library(psych)
library(relaimpo)
## Warning: package 'relaimpo' was built under R version 4.2.3
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: survey
## Warning: package 'survey' was built under R version 4.2.3
## Loading required package: grid
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## Warning: package 'mitools' was built under R version 4.2.3
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.2.3
##
## Attaching package: 'rcompanion'
## The following object is masked from 'package:psych':
##
## phi
library(MASS)
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
## The following object is masked from 'package:psych':
##
## logit
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
#load data
food <- read.csv("pk_food_prices.csv")
dim(food)
## [1] 7663 18
#lot of missing values found
md.pattern(food, rotate.names = TRUE)
## X_id date cmname unit category price currency country admname adm1id
## 7662 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0
## mktname cmid catid sn mktid ptid umid default
## 7662 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 0 0 0 0 4
## 0 0 0 0 1 1 1 7663 7666
colSums(is.na(food))
## X_id date cmname unit category price currency country
## 0 0 0 0 0 0 0 0
## admname adm1id mktname mktid cmid ptid umid catid
## 0 0 0 1 0 1 1 0
## sn default
## 0 7663
#no duplicate data found
sum(duplicated(food))
## [1] 0
food %>% select(X_id, default,cmname) %>% group_by(cmname)
## # A tibble: 7,663 × 3
## # Groups: cmname [18]
## X_id default cmname
## <int> <lgl> <chr>
## 1 1 NA #item+name
## 2 2 NA Wheat flour - Retail
## 3 3 NA Wheat flour - Retail
## 4 4 NA Wheat flour - Retail
## 5 5 NA Wheat flour - Retail
## 6 6 NA Wheat flour - Retail
## 7 7 NA Wheat flour - Retail
## 8 8 NA Wheat flour - Retail
## 9 9 NA Wheat flour - Retail
## 10 10 NA Wheat flour - Retail
## # ℹ 7,653 more rows
#remove the variable default which is all empty
food$default <- NULL
dim(food)
## [1] 7663 17
#string split
#t[c('c1', 'c2','c3', 'c4')] <- str_split_fixed(t$complete_location, ' ', 4)
describe(food)
## vars n mean sd median trimmed mad min max range
## X_id 1 7663 3832.00 2212.26 3832 3832.00 2840.66 1 7663 7662
## date* 2 7663 132.54 46.90 145 138.44 38.55 1 191 190
## cmname* 3 7663 11.38 4.90 12 11.63 5.93 1 18 17
## unit* 4 7663 4.05 0.61 4 4.09 0.00 1 5 4
## category* 5 7663 4.12 2.30 3 3.90 1.48 1 8 7
## price* 6 7663 1515.76 847.96 1555 1526.09 994.82 1 2989 2988
## currency* 7 7663 2.00 0.01 2 2.00 0.00 1 2 1
## country* 8 7663 2.00 0.01 2 2.00 0.00 1 2 1
## admname* 9 7663 3.62 1.01 4 3.65 1.48 1 5 4
## adm1id* 10 7663 3.62 1.01 4 3.65 1.48 1 5 4
## mktname* 11 7663 3.97 1.41 4 3.97 1.48 1 6 5
## mktid 12 7662 292.98 1.40 293 292.97 1.48 291 295 4
## cmid* 13 7663 9.98 4.95 11 9.99 5.93 1 18 17
## ptid 14 7662 15.00 0.00 15 15.00 0.00 15 15 0
## umid 15 7662 9.75 10.77 5 6.87 0.00 5 51 46
## catid* 16 7663 4.06 2.26 3 3.83 1.48 1 8 7
## sn* 17 7663 43.56 24.28 45 43.52 31.13 1 86 85
## skew kurtosis se
## X_id 0.00 -1.20 25.27
## date* -1.00 0.09 0.54
## cmname* -0.33 -1.01 0.06
## unit* -1.12 3.66 0.01
## category* 0.48 -1.39 0.03
## price* -0.06 -1.11 9.69
## currency* -87.50 7656.00 0.00
## country* -87.50 7656.00 0.00
## admname* -0.29 -1.00 0.01
## adm1id* -0.29 -1.00 0.01
## mktname* 0.02 -1.29 0.02
## mktid 0.02 -1.29 0.02
## cmid* -0.12 -1.20 0.06
## ptid NaN NaN 0.00
## umid 2.76 7.02 0.12
## catid* 0.55 -1.26 0.03
## sn* 0.00 -1.19 0.28
str(food)
## 'data.frame': 7663 obs. of 17 variables:
## $ X_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date : chr "#date" "2004-01-15" "2004-02-15" "2004-03-15" ...
## $ cmname : chr "#item+name" "Wheat flour - Retail" "Wheat flour - Retail" "Wheat flour - Retail" ...
## $ unit : chr "#item+unit" "KG" "KG" "KG" ...
## $ category: chr "#item+type" "cereals and tubers" "cereals and tubers" "cereals and tubers" ...
## $ price : chr "#value" "13.0" "13.0" "14.25" ...
## $ currency: chr "#currency" "PKR" "PKR" "PKR" ...
## $ country : chr "#country+name" "Pakistan" "Pakistan" "Pakistan" ...
## $ admname : chr "#adm1+name" "Balochistan" "Balochistan" "Balochistan" ...
## $ adm1id : chr "#adm1+code" "2272" "2272" "2272" ...
## $ mktname : chr "#name+market" "Quetta" "Quetta" "Quetta" ...
## $ mktid : int NA 295 295 295 295 295 295 295 295 295 ...
## $ cmid : chr "#item+code" "58" "58" "58" ...
## $ ptid : int NA 15 15 15 15 15 15 15 15 15 ...
## $ umid : int NA 5 5 5 5 5 5 5 5 5 ...
## $ catid : chr "#item+type+code" "1" "1" "1" ...
## $ sn : chr "#meta+id" "295_58_15_5" "295_58_15_5" "295_58_15_5" ...
head(food)
## X_id date cmname unit category price
## 1 1 #date #item+name #item+unit #item+type #value
## 2 2 2004-01-15 Wheat flour - Retail KG cereals and tubers 13.0
## 3 3 2004-02-15 Wheat flour - Retail KG cereals and tubers 13.0
## 4 4 2004-03-15 Wheat flour - Retail KG cereals and tubers 14.25
## 5 5 2004-04-15 Wheat flour - Retail KG cereals and tubers 12.5
## 6 6 2004-05-15 Wheat flour - Retail KG cereals and tubers 13.25
## currency country admname adm1id mktname mktid cmid
## 1 #currency #country+name #adm1+name #adm1+code #name+market NA #item+code
## 2 PKR Pakistan Balochistan 2272 Quetta 295 58
## 3 PKR Pakistan Balochistan 2272 Quetta 295 58
## 4 PKR Pakistan Balochistan 2272 Quetta 295 58
## 5 PKR Pakistan Balochistan 2272 Quetta 295 58
## 6 PKR Pakistan Balochistan 2272 Quetta 295 58
## ptid umid catid sn
## 1 NA NA #item+type+code #meta+id
## 2 15 5 1 295_58_15_5
## 3 15 5 1 295_58_15_5
## 4 15 5 1 295_58_15_5
## 5 15 5 1 295_58_15_5
## 6 15 5 1 295_58_15_5
food <- food[-1,]
dim(food)
## [1] 7662 17
head(food)
## X_id date cmname unit category price currency
## 2 2 2004-01-15 Wheat flour - Retail KG cereals and tubers 13.0 PKR
## 3 3 2004-02-15 Wheat flour - Retail KG cereals and tubers 13.0 PKR
## 4 4 2004-03-15 Wheat flour - Retail KG cereals and tubers 14.25 PKR
## 5 5 2004-04-15 Wheat flour - Retail KG cereals and tubers 12.5 PKR
## 6 6 2004-05-15 Wheat flour - Retail KG cereals and tubers 13.25 PKR
## 7 7 2004-06-15 Wheat flour - Retail KG cereals and tubers 13.405 PKR
## country admname adm1id mktname mktid cmid ptid umid catid sn
## 2 Pakistan Balochistan 2272 Quetta 295 58 15 5 1 295_58_15_5
## 3 Pakistan Balochistan 2272 Quetta 295 58 15 5 1 295_58_15_5
## 4 Pakistan Balochistan 2272 Quetta 295 58 15 5 1 295_58_15_5
## 5 Pakistan Balochistan 2272 Quetta 295 58 15 5 1 295_58_15_5
## 6 Pakistan Balochistan 2272 Quetta 295 58 15 5 1 295_58_15_5
## 7 Pakistan Balochistan 2272 Quetta 295 58 15 5 1 295_58_15_5
#change item types
food$price <- as.numeric(food$price)
table(food$cmname)
##
## Beans(mash) - Retail
## 325
## Eggs - Retail
## 298
## Fuel (diesel) - Retail
## 350
## Fuel (petrol-gasoline) - Retail
## 295
## Ghee (artificial) - Retail
## 390
## Lentils (masur) - Retail
## 313
## Lentils (moong) - Retail
## 275
## Milk - Retail
## 265
## Oil (cooking) - Retail
## 392
## Poultry - Retail
## 392
## Rice (basmati, broken) - Retail
## 950
## Rice (coarse) - Retail
## 830
## Salt - Retail
## 205
## Sugar - Retail
## 392
## Wage (non-qualified labour, non-agricultural) - Retail
## 313
## Wheat - Retail
## 727
## Wheat flour - Retail
## 950
food[c("itemname", "retail")] <- str_split_fixed(food$cmname, '-', 2)
#remove redundant variables
food$cmname <- NULL
food$retail <- NULL
library(lubridate)
#missmap(food)
food$date <- as.Date(food$date, format="%m/%d/%Y")
food$year <- year(food$date)
food$month <- month(food$date, label = TRUE)
food$day <- wday(food$date, label = TRUE)
colSums(is.na(food))
## X_id date unit category price currency country admname
## 0 7662 0 0 0 0 0 0
## adm1id mktname mktid cmid ptid umid catid sn
## 0 0 0 0 0 0 0 0
## itemname year month day
## 0 7662 7662 7662
sum(duplicated(food))
## [1] 0
#for weekend label
#ifelse(a %in% c('Monday','Tuesday','Wednesday','Thursday','Friday'), 'Weekday', 'Weekend')
#ifelse(a %in% c('Sunday', 'Saturday'), 'Weekend', 'Weekday')
food$weekend <- ifelse(food$day %in% c('Sunday', 'Saturday'), 'Weekend', 'Weekday')
#remove date variable
food$date <- NULL
dim(food)
## [1] 7662 20
str(food)
## 'data.frame': 7662 obs. of 20 variables:
## $ X_id : int 2 3 4 5 6 7 8 9 10 11 ...
## $ unit : chr "KG" "KG" "KG" "KG" ...
## $ category: chr "cereals and tubers" "cereals and tubers" "cereals and tubers" "cereals and tubers" ...
## $ price : num 13 13 14.2 12.5 13.2 ...
## $ currency: chr "PKR" "PKR" "PKR" "PKR" ...
## $ country : chr "Pakistan" "Pakistan" "Pakistan" "Pakistan" ...
## $ admname : chr "Balochistan" "Balochistan" "Balochistan" "Balochistan" ...
## $ adm1id : chr "2272" "2272" "2272" "2272" ...
## $ mktname : chr "Quetta" "Quetta" "Quetta" "Quetta" ...
## $ mktid : int 295 295 295 295 295 295 295 295 295 295 ...
## $ cmid : chr "58" "58" "58" "58" ...
## $ ptid : int 15 15 15 15 15 15 15 15 15 15 ...
## $ umid : int 5 5 5 5 5 5 5 5 5 5 ...
## $ catid : chr "1" "1" "1" "1" ...
## $ sn : chr "295_58_15_5" "295_58_15_5" "295_58_15_5" "295_58_15_5" ...
## $ itemname: chr "Wheat flour " "Wheat flour " "Wheat flour " "Wheat flour " ...
## $ year : num NA NA NA NA NA NA NA NA NA NA ...
## $ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: NA NA NA NA NA NA NA NA NA NA ...
## $ day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: NA NA NA NA NA NA NA NA NA NA ...
## $ weekend : chr "Weekday" "Weekday" "Weekday" "Weekday" ...
glimpse(food)
## Rows: 7,662
## Columns: 20
## $ X_id <int> 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ unit <chr> "KG", "KG", "KG", "KG", "KG", "KG", "KG", "KG", "KG", "KG", "…
## $ category <chr> "cereals and tubers", "cereals and tubers", "cereals and tube…
## $ price <dbl> 13.0000, 13.0000, 14.2500, 12.5000, 13.2500, 13.4050, 13.6000…
## $ currency <chr> "PKR", "PKR", "PKR", "PKR", "PKR", "PKR", "PKR", "PKR", "PKR"…
## $ country <chr> "Pakistan", "Pakistan", "Pakistan", "Pakistan", "Pakistan", "…
## $ admname <chr> "Balochistan", "Balochistan", "Balochistan", "Balochistan", "…
## $ adm1id <chr> "2272", "2272", "2272", "2272", "2272", "2272", "2272", "2272…
## $ mktname <chr> "Quetta", "Quetta", "Quetta", "Quetta", "Quetta", "Quetta", "…
## $ mktid <int> 295, 295, 295, 295, 295, 295, 295, 295, 295, 295, 295, 295, 2…
## $ cmid <chr> "58", "58", "58", "58", "58", "58", "58", "58", "58", "58", "…
## $ ptid <int> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1…
## $ umid <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ catid <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "…
## $ sn <chr> "295_58_15_5", "295_58_15_5", "295_58_15_5", "295_58_15_5", "…
## $ itemname <chr> "Wheat flour ", "Wheat flour ", "Wheat flour ", "Wheat flour …
## $ year <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ month <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ day <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ weekend <chr> "Weekday", "Weekday", "Weekday", "Weekday", "Weekday", "Weekd…
All data is clean and no duplicate is found either
Our response variable is price check its spread
summary(food$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 36.92 72.73 106.92 138.40 997.00
Big difference between median and mean seems rightly skewed data look at its histogram
par(mfrow = c(1,2))
hist(food$price, main = "Histogram of prices", col = "lightblue")
plot(density(food$price), main = "Density plot of prices")
polygon(density(food$price), col = "brown")
In regression situations, rightly skewed data cause poorly fitting model. Use log transformation
food$price <- log(food$price)
summary(food$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.197 3.609 4.287 4.241 4.930 6.905
#look at visuals
par(mfrow = c(1,2))
hist(food$price, main = "Histogram of prices", col = "lightblue")
plot(density(food$price), main = "Density plot of Prices")
polygon(density(food$price), col = "orange")
Log transforamtion gives significant nromal spread. #check outliers
outliers_remover <- function(a){
df <- a
aa <- c()
count <- 1
for(i in 1:ncol(df)){
if(is.numeric(df[,i])){
Q3 <- quantile(df[,i], 0.75, na.rm = TRUE)
Q1 <- quantile(df[,i], 0.25, na.rm = TRUE)
IQR <- Q3 - Q1 #IQR(df[,i])
upper <- Q3 + 1.5 * IQR
lower <- Q1 - 1.5 * IQR
for(j in 1:nrow(df)){
if(is.na(df[j,i]) == TRUE){
next
}
else if(df[j,i] > upper | df[j,i] < lower){
aa[count] <- j
count <- count+1
}
}
}
}
df <- df[-aa,]
}
food_new <- outliers_remover(food)
str(food_new)
## 'data.frame': 5749 obs. of 20 variables:
## $ X_id : int 2 3 4 5 6 7 8 9 10 11 ...
## $ unit : chr "KG" "KG" "KG" "KG" ...
## $ category: chr "cereals and tubers" "cereals and tubers" "cereals and tubers" "cereals and tubers" ...
## $ price : num 2.56 2.56 2.66 2.53 2.58 ...
## $ currency: chr "PKR" "PKR" "PKR" "PKR" ...
## $ country : chr "Pakistan" "Pakistan" "Pakistan" "Pakistan" ...
## $ admname : chr "Balochistan" "Balochistan" "Balochistan" "Balochistan" ...
## $ adm1id : chr "2272" "2272" "2272" "2272" ...
## $ mktname : chr "Quetta" "Quetta" "Quetta" "Quetta" ...
## $ mktid : int 295 295 295 295 295 295 295 295 295 295 ...
## $ cmid : chr "58" "58" "58" "58" ...
## $ ptid : int 15 15 15 15 15 15 15 15 15 15 ...
## $ umid : int 5 5 5 5 5 5 5 5 5 5 ...
## $ catid : chr "1" "1" "1" "1" ...
## $ sn : chr "295_58_15_5" "295_58_15_5" "295_58_15_5" "295_58_15_5" ...
## $ itemname: chr "Wheat flour " "Wheat flour " "Wheat flour " "Wheat flour " ...
## $ year : num NA NA NA NA NA NA NA NA NA NA ...
## $ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: NA NA NA NA NA NA NA NA NA NA ...
## $ day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: NA NA NA NA NA NA NA NA NA NA ...
## $ weekend : chr "Weekday" "Weekday" "Weekday" "Weekday" ...
Following outliers removal data shrunk from 7662 to 5273
prov_model <- lm(price ~ admname, data = food_new)
summary(prov_model)
##
## Call:
## lm(formula = price ~ admname, data = food_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80270 -0.54662 -0.08891 0.80983 1.74923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.10529 0.02582 158.981 < 2e-16 ***
## admnameKhyber Pakhtunkhwa -0.13180 0.03568 -3.694 0.000223 ***
## admnamePunjab -0.14598 0.03116 -4.685 2.86e-06 ***
## admnameSindh -0.09818 0.03568 -2.752 0.005943 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8431 on 5745 degrees of freedom
## Multiple R-squared: 0.00404, Adjusted R-squared: 0.00352
## F-statistic: 7.769 on 3 and 5745 DF, p-value: 3.563e-05
mkt_model <- lm(price ~ mktname, data = food_new)
summary(mkt_model)
##
## Call:
## lm(formula = price ~ mktname, data = food_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80270 -0.54138 -0.09234 0.81390 1.71727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.00711 0.02461 162.815 < 2e-16 ***
## mktnameLahore -0.01584 0.03484 -0.455 0.64925
## mktnameMultan -0.07976 0.03484 -2.290 0.02207 *
## mktnamePeshawar -0.03362 0.03481 -0.966 0.33419
## mktnameQuetta 0.09818 0.03567 2.752 0.00593 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8429 on 5744 degrees of freedom
## Multiple R-squared: 0.004623, Adjusted R-squared: 0.00393
## F-statistic: 6.669 on 4 and 5744 DF, p-value: 2.373e-05
catg_model <- lm(price ~ category, data = food_new)
summary(catg_model)
##
## Call:
## lm(formula = price ~ category, data = food_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28941 -0.20612 0.03255 0.30644 1.24335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.564761 0.008233 432.998 < 2e-16 ***
## categorymeat, fish and eggs 1.484458 0.025797 57.543 < 2e-16 ***
## categorymiscellaneous food -0.078128 0.021454 -3.642 0.000273 ***
## categoryoil and fats 1.689127 0.025857 65.326 < 2e-16 ***
## categorypulses and nuts 1.426651 0.018012 79.208 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4841 on 5744 degrees of freedom
## Multiple R-squared: 0.6718, Adjusted R-squared: 0.6715
## F-statistic: 2939 on 4 and 5744 DF, p-value: < 2.2e-16
#Multiple Linear Regression
mul_model <- lm(price ~ admname + mktname + category + itemname, data = food_new)
summary(mul_model)
##
## Call:
## lm(formula = price ~ admname + mktname + category + itemname,
## data = food_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05536 -0.10669 0.05156 0.22451 0.85024
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.363349 0.014647 229.631 < 2e-16 ***
## admnameKhyber Pakhtunkhwa -0.069293 0.014669 -4.724 2.37e-06 ***
## admnamePunjab -0.113255 0.014679 -7.716 1.41e-14 ***
## admnameSindh -0.035459 0.014666 -2.418 0.0157 *
## mktnameLahore 0.063919 0.014293 4.472 7.89e-06 ***
## mktnameMultan NA NA NA NA
## mktnamePeshawar NA NA NA NA
## mktnameQuetta NA NA NA NA
## categorymeat, fish and eggs 1.739247 0.020744 83.845 < 2e-16 ***
## categorymiscellaneous food 0.768669 0.020744 37.056 < 2e-16 ***
## categoryoil and fats 1.944007 0.020781 93.546 < 2e-16 ***
## categorypulses and nuts 1.882639 0.022206 84.782 < 2e-16 ***
## itemnameGhee (artificial) NA NA NA NA
## itemnameLentils (masur) -0.354042 0.027366 -12.937 < 2e-16 ***
## itemnameLentils (moong) -0.264542 0.028313 -9.344 < 2e-16 ***
## itemnamePoultry NA NA NA NA
## itemnameRice (basmati, broken) 0.594522 0.015855 37.498 < 2e-16 ***
## itemnameRice (coarse) 0.380925 0.016418 23.202 < 2e-16 ***
## itemnameSalt -1.723951 0.029784 -57.882 < 2e-16 ***
## itemnameSugar NA NA NA NA
## itemnameWheat 0.007794 0.017080 0.456 0.6482
## itemnameWheat flour NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3456 on 5734 degrees of freedom
## Multiple R-squared: 0.833, Adjusted R-squared: 0.8326
## F-statistic: 2043 on 14 and 5734 DF, p-value: < 2.2e-16
summary(food$price - mul_model$fitted.values)
## Warning in food$price - mul_model$fitted.values: longer object length is not a
## multiple of shorter object length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.6689 -0.5098 0.1890 0.2480 1.0964 3.4137
We want our median value to be centered around zero as this tells that are residuals are somewhat symmetrical and that model is predicting evenly at both high and low ends of our dataset. Our output shows that our model is quite symmetrical and is predicting both high and low prices. This can be further confirmed using quantile-quantile plot.
plot(mul_model)
#COefficents According the coefficients the equation is like price = 1.31 + (-0.02) * KPK + (-0.03) * punjab + (-0.01) * sindh + (0.02) * lahore + 0.31 * categorymeat, fish and egg + (-0.07)* categoryoil and fats + 0.3 * categorypulses and nuts
#Standard Error of Coefficient Standard error of coeff is an estimate of the standard deviaton of the coeff. This tells how much uncertainity there is with our coeff. The SE is often used to create confidence interval
#T-value T value is quick way to check if coeffient is signicant or not. T-statistic is coeff divided by SE.In general we want coeff to have large t-statistics this indicate that our SE is small in compariseon to our coeff. The larger T-statistic is, the more certain we can be that the coeff is not zero. The t-statistic is used to find p-value.
#p-value The p-value is calculated using the t-statistic from the T distribution. The p-value, in association with the t-statistic, help us to understand how significant our coefficient is to the model. In practice, any p-value below 0.05 is usually deemed as significant.It means we are confident that the coefficient is not zero, meaning the coefficient does in fact add value to the model by helping to explain the variance within our dependent variable. In our model, we can see that the p-values for the Intercept and points are extremely small. This leads us to conclude that there is strong evidence that the coefficients in this model are not zero.
#Residual Standard Error RSE The residual standard error is a measure of how well the model fits the data.In general we want the smaller the residual standard error possible, because that means our model’s prediction line is very close to actual values on average. In our model the actual values are 0.123 away from the predicted values or regression line.
#R-Squared The Multiple R-squared value is most often used for simple linear regression (one predictor). It tells us what percentage of the variation within our dependent variable that the independent variable is explaining. In other words, it’s another method to determine how well our model is fitting the data.In our model points explain 62% of the variation withing the price, our dependant variable.
#Adjusted R-Square The Adjusted R-squared value is used when running multiple linear regression and can conceptually be thought of in the same way we described Multiple R-squared. The Adjusted R-squared value shows what percentage of the variation within our dependent variable that all predictors are explaining. The difference between these two metrics is a nuance in the calculation where we adjust for the variance attributed by adding multiple variables.
#F-statistic and p-value When running a regression model, either simple or multiple, a hypothesis test is being run on the global model. The null hypothesis is that there is no relationship between the dependent variable and the independent variable(s) and the alternative hypothesis is that there is a relationship. Said another way, the null hypothesis is that the coefficients for all of the variables in your model are zero. The alternative hypothesis is that at least one of them is not zero.The F-statistic and overall p-value help us determine the result of this test. Looking at the F-statistic alone can be a little misleading depending on how many variables are in your test. If you have a lot of independent variables, it’s common for an F-statistic to be close to one and to still produce a p-value where we would reject the null hypothesis. However, for smaller models, a larger F-statistic generally indicates that the null hypothesis should be rejected. A better approach is to utilize the p-value that is associated with the F-statistic. Again, in practice, a p-value below 0.05 generally indicates that you have at least one coefficient in your model that isn’t zero. We can see from our model, the F-statistic is very large and our p-value is so small it is basically zero. This would lead us to reject the null hypothesis and conclude that there is strong evidence that a relationship does exist between salary and points.
#Choosing important vairales
#relative importance of variables
# relative_imp <- calc.relimp(mul_model, type = "lmg", rela = TRUE)
#
# sort(relative_imp$lmg, decreasing = TRUE)
#Stepwise regression Stepwise regression provides a number of stopping rules for selecting the best subset of variables for our model. In stepwise selection, variables are added to or deleted from a model one at a time, until some stopping criterion is reached.
# mul_model1 <- lm(price ~ admname + mktname + category, data = food_new)
#
# mul_model2 <- lm(price ~ admname + mktname, data = food_new)
#
# mul_model3 <- lm(price ~ category, data = food_new)
#
# com_mod <- compareLM(mul_model1, mul_model2)
# com_mod
mul_model <- lm(price ~ admname + mktname + category, data = food_new)
stepAIC(mul_model, direction = "backward")
## Start: AIC=-8377.45
## price ~ admname + mktname + category
##
##
## Step: AIC=-8377.45
## price ~ mktname + category
##
## Df Sum of Sq RSS AIC
## <none> 1334.7 -8377.4
## - mktname 4 11.19 1345.9 -8337.5
## - category 4 2746.53 4081.2 -1959.8
##
## Call:
## lm(formula = price ~ mktname + category, data = food_new)
##
## Coefficients:
## (Intercept) mktnameLahore
## 3.58110 -0.01490
## mktnameMultan mktnamePeshawar
## -0.07882 -0.03524
## mktnameQuetta categorymeat, fish and eggs
## 0.05805 1.48223
## categorymiscellaneous food categoryoil and fats
## -0.08033 1.68697
## categorypulses and nuts
## 1.42457
#Multicollinearity, Variation Inflation Factor (VIF)¶ Multicollinearity(or substantial correlation) generally occurs when there are high correlations between two or more predictor variables. In other words, one predictor variable can be used to predict the other. For example: a person’s hight and weight Not as before methods, the VIF checks the model.
As a general rule of thumb, a VIF for a predictor greater than 5 or 10 indicates that multicollinearity. or square-root(VIF) > 2 indicates multicollinearity problem.
# library(car)
# vif(mul_model1)
#
# #to predict which variables are perfectly correlated
# cor(food_new)
#best subsets
#subsets <- regsubsets(price ~ ., data = food_new, nbest = 2)
#check model assumptions we can create a histogram of residuals from a linear model. The distribution of these residuals should be approximately normal. We can find the residuals of a model by using residuals() function.
mod_re <- residuals(mul_model)
hist(mod_re)
The distribution is right skewed slightly.
#Check the linearity between residuals and predicted values. The residuals should be unbiased and homoscedastic
par(mfrow = c(2,2))
plot(mul_model)
food <- read.csv("pk_food_prices.csv")
food$price <- as.numeric(food$price)
## Warning: NAs introduced by coercion
food$price <- log(food$price)
#create training and Test data
trainingrowindex <- sample(1:nrow(food_new), 0.7 * nrow(food_new))
trainingData <- food_new[trainingrowindex, ] #train data
testData <- food_new[-trainingrowindex, ] #test data
#build model on training data
model1 <- lm(price ~ admname + mktname + category + itemname, data = trainingData)
#predict
pred1 <- predict(model1, testData)
## Warning in predict.lm(model1, testData): prediction from a rank-deficient fit
## may be misleading
actuals_pred1 <- data.frame(cbind(actuals = testData$price, predicted = pred1))
cor(actuals_pred1)
## actuals predicted
## actuals 1.0000000 0.9095486
## predicted 0.9095486 1.0000000