knitr::opts_chunk$set(echo = TRUE)

#load libraries
library(psych)
library(relaimpo)
## Warning: package 'relaimpo' was built under R version 4.2.3
## Loading required package: MASS
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: survey
## Warning: package 'survey' was built under R version 4.2.3
## Loading required package: grid
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## Warning: package 'mitools' was built under R version 4.2.3
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.2.3
## 
## Attaching package: 'rcompanion'
## The following object is masked from 'package:psych':
## 
##     phi
library(MASS)
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:boot':
## 
##     logit
## The following object is masked from 'package:psych':
## 
##     logit
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ tidyr::expand()  masks Matrix::expand()
## ✖ dplyr::filter()  masks mice::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ tidyr::pack()    masks Matrix::pack()
## ✖ dplyr::recode()  masks car::recode()
## ✖ dplyr::select()  masks MASS::select()
## ✖ purrr::some()    masks car::some()
## ✖ tidyr::unpack()  masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

#load data
food <- read.csv("pk_food_prices.csv")
dim(food)
## [1] 7663   18
#lot of missing values found
md.pattern(food, rotate.names = TRUE)

##      X_id date cmname unit category price currency country admname adm1id
## 7662    1    1      1    1        1     1        1       1       1      1
## 1       1    1      1    1        1     1        1       1       1      1
##         0    0      0    0        0     0        0       0       0      0
##      mktname cmid catid sn mktid ptid umid default     
## 7662       1    1     1  1     1    1    1       0    1
## 1          1    1     1  1     0    0    0       0    4
##            0    0     0  0     1    1    1    7663 7666
colSums(is.na(food))
##     X_id     date   cmname     unit category    price currency  country 
##        0        0        0        0        0        0        0        0 
##  admname   adm1id  mktname    mktid     cmid     ptid     umid    catid 
##        0        0        0        1        0        1        1        0 
##       sn  default 
##        0     7663
#no duplicate data found
sum(duplicated(food))
## [1] 0
food %>% select(X_id, default,cmname) %>% group_by(cmname)
## # A tibble: 7,663 × 3
## # Groups:   cmname [18]
##     X_id default cmname              
##    <int> <lgl>   <chr>               
##  1     1 NA      #item+name          
##  2     2 NA      Wheat flour - Retail
##  3     3 NA      Wheat flour - Retail
##  4     4 NA      Wheat flour - Retail
##  5     5 NA      Wheat flour - Retail
##  6     6 NA      Wheat flour - Retail
##  7     7 NA      Wheat flour - Retail
##  8     8 NA      Wheat flour - Retail
##  9     9 NA      Wheat flour - Retail
## 10    10 NA      Wheat flour - Retail
## # ℹ 7,653 more rows
#remove the variable default which is all empty
food$default <- NULL
dim(food)
## [1] 7663   17
#string split
#t[c('c1', 'c2','c3', 'c4')] <- str_split_fixed(t$complete_location,  ' ', 4)

describe(food)
##           vars    n    mean      sd median trimmed     mad min  max range
## X_id         1 7663 3832.00 2212.26   3832 3832.00 2840.66   1 7663  7662
## date*        2 7663  132.54   46.90    145  138.44   38.55   1  191   190
## cmname*      3 7663   11.38    4.90     12   11.63    5.93   1   18    17
## unit*        4 7663    4.05    0.61      4    4.09    0.00   1    5     4
## category*    5 7663    4.12    2.30      3    3.90    1.48   1    8     7
## price*       6 7663 1515.76  847.96   1555 1526.09  994.82   1 2989  2988
## currency*    7 7663    2.00    0.01      2    2.00    0.00   1    2     1
## country*     8 7663    2.00    0.01      2    2.00    0.00   1    2     1
## admname*     9 7663    3.62    1.01      4    3.65    1.48   1    5     4
## adm1id*     10 7663    3.62    1.01      4    3.65    1.48   1    5     4
## mktname*    11 7663    3.97    1.41      4    3.97    1.48   1    6     5
## mktid       12 7662  292.98    1.40    293  292.97    1.48 291  295     4
## cmid*       13 7663    9.98    4.95     11    9.99    5.93   1   18    17
## ptid        14 7662   15.00    0.00     15   15.00    0.00  15   15     0
## umid        15 7662    9.75   10.77      5    6.87    0.00   5   51    46
## catid*      16 7663    4.06    2.26      3    3.83    1.48   1    8     7
## sn*         17 7663   43.56   24.28     45   43.52   31.13   1   86    85
##             skew kurtosis    se
## X_id        0.00    -1.20 25.27
## date*      -1.00     0.09  0.54
## cmname*    -0.33    -1.01  0.06
## unit*      -1.12     3.66  0.01
## category*   0.48    -1.39  0.03
## price*     -0.06    -1.11  9.69
## currency* -87.50  7656.00  0.00
## country*  -87.50  7656.00  0.00
## admname*   -0.29    -1.00  0.01
## adm1id*    -0.29    -1.00  0.01
## mktname*    0.02    -1.29  0.02
## mktid       0.02    -1.29  0.02
## cmid*      -0.12    -1.20  0.06
## ptid         NaN      NaN  0.00
## umid        2.76     7.02  0.12
## catid*      0.55    -1.26  0.03
## sn*         0.00    -1.19  0.28
str(food)
## 'data.frame':    7663 obs. of  17 variables:
##  $ X_id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ date    : chr  "#date" "2004-01-15" "2004-02-15" "2004-03-15" ...
##  $ cmname  : chr  "#item+name" "Wheat flour - Retail" "Wheat flour - Retail" "Wheat flour - Retail" ...
##  $ unit    : chr  "#item+unit" "KG" "KG" "KG" ...
##  $ category: chr  "#item+type" "cereals and tubers" "cereals and tubers" "cereals and tubers" ...
##  $ price   : chr  "#value" "13.0" "13.0" "14.25" ...
##  $ currency: chr  "#currency" "PKR" "PKR" "PKR" ...
##  $ country : chr  "#country+name" "Pakistan" "Pakistan" "Pakistan" ...
##  $ admname : chr  "#adm1+name" "Balochistan" "Balochistan" "Balochistan" ...
##  $ adm1id  : chr  "#adm1+code" "2272" "2272" "2272" ...
##  $ mktname : chr  "#name+market" "Quetta" "Quetta" "Quetta" ...
##  $ mktid   : int  NA 295 295 295 295 295 295 295 295 295 ...
##  $ cmid    : chr  "#item+code" "58" "58" "58" ...
##  $ ptid    : int  NA 15 15 15 15 15 15 15 15 15 ...
##  $ umid    : int  NA 5 5 5 5 5 5 5 5 5 ...
##  $ catid   : chr  "#item+type+code" "1" "1" "1" ...
##  $ sn      : chr  "#meta+id" "295_58_15_5" "295_58_15_5" "295_58_15_5" ...
head(food)
##   X_id       date               cmname       unit           category  price
## 1    1      #date           #item+name #item+unit         #item+type #value
## 2    2 2004-01-15 Wheat flour - Retail         KG cereals and tubers   13.0
## 3    3 2004-02-15 Wheat flour - Retail         KG cereals and tubers   13.0
## 4    4 2004-03-15 Wheat flour - Retail         KG cereals and tubers  14.25
## 5    5 2004-04-15 Wheat flour - Retail         KG cereals and tubers   12.5
## 6    6 2004-05-15 Wheat flour - Retail         KG cereals and tubers  13.25
##    currency       country     admname     adm1id      mktname mktid       cmid
## 1 #currency #country+name  #adm1+name #adm1+code #name+market    NA #item+code
## 2       PKR      Pakistan Balochistan       2272       Quetta   295         58
## 3       PKR      Pakistan Balochistan       2272       Quetta   295         58
## 4       PKR      Pakistan Balochistan       2272       Quetta   295         58
## 5       PKR      Pakistan Balochistan       2272       Quetta   295         58
## 6       PKR      Pakistan Balochistan       2272       Quetta   295         58
##   ptid umid           catid          sn
## 1   NA   NA #item+type+code    #meta+id
## 2   15    5               1 295_58_15_5
## 3   15    5               1 295_58_15_5
## 4   15    5               1 295_58_15_5
## 5   15    5               1 295_58_15_5
## 6   15    5               1 295_58_15_5
food <- food[-1,]
dim(food)
## [1] 7662   17
head(food)
##   X_id       date               cmname unit           category  price currency
## 2    2 2004-01-15 Wheat flour - Retail   KG cereals and tubers   13.0      PKR
## 3    3 2004-02-15 Wheat flour - Retail   KG cereals and tubers   13.0      PKR
## 4    4 2004-03-15 Wheat flour - Retail   KG cereals and tubers  14.25      PKR
## 5    5 2004-04-15 Wheat flour - Retail   KG cereals and tubers   12.5      PKR
## 6    6 2004-05-15 Wheat flour - Retail   KG cereals and tubers  13.25      PKR
## 7    7 2004-06-15 Wheat flour - Retail   KG cereals and tubers 13.405      PKR
##    country     admname adm1id mktname mktid cmid ptid umid catid          sn
## 2 Pakistan Balochistan   2272  Quetta   295   58   15    5     1 295_58_15_5
## 3 Pakistan Balochistan   2272  Quetta   295   58   15    5     1 295_58_15_5
## 4 Pakistan Balochistan   2272  Quetta   295   58   15    5     1 295_58_15_5
## 5 Pakistan Balochistan   2272  Quetta   295   58   15    5     1 295_58_15_5
## 6 Pakistan Balochistan   2272  Quetta   295   58   15    5     1 295_58_15_5
## 7 Pakistan Balochistan   2272  Quetta   295   58   15    5     1 295_58_15_5
#change item types
food$price <- as.numeric(food$price)
table(food$cmname)
## 
##                                   Beans(mash) - Retail 
##                                                    325 
##                                          Eggs - Retail 
##                                                    298 
##                                 Fuel (diesel) - Retail 
##                                                    350 
##                        Fuel (petrol-gasoline) - Retail 
##                                                    295 
##                             Ghee (artificial) - Retail 
##                                                    390 
##                               Lentils (masur) - Retail 
##                                                    313 
##                               Lentils (moong) - Retail 
##                                                    275 
##                                          Milk - Retail 
##                                                    265 
##                                 Oil (cooking) - Retail 
##                                                    392 
##                                       Poultry - Retail 
##                                                    392 
##                        Rice (basmati, broken) - Retail 
##                                                    950 
##                                 Rice (coarse) - Retail 
##                                                    830 
##                                          Salt - Retail 
##                                                    205 
##                                         Sugar - Retail 
##                                                    392 
## Wage (non-qualified labour, non-agricultural) - Retail 
##                                                    313 
##                                         Wheat - Retail 
##                                                    727 
##                                   Wheat flour - Retail 
##                                                    950
food[c("itemname", "retail")] <- str_split_fixed(food$cmname, '-', 2)
#remove redundant variables
food$cmname <- NULL
food$retail <- NULL

library(lubridate)
#missmap(food)

food$date <- as.Date(food$date, format="%m/%d/%Y")
food$year <- year(food$date)
food$month <- month(food$date, label = TRUE)
food$day <- wday(food$date, label = TRUE)


colSums(is.na(food))
##     X_id     date     unit category    price currency  country  admname 
##        0     7662        0        0        0        0        0        0 
##   adm1id  mktname    mktid     cmid     ptid     umid    catid       sn 
##        0        0        0        0        0        0        0        0 
## itemname     year    month      day 
##        0     7662     7662     7662
sum(duplicated(food))
## [1] 0
#for weekend label
#ifelse(a %in% c('Monday','Tuesday','Wednesday','Thursday','Friday'), 'Weekday', 'Weekend')

#ifelse(a %in% c('Sunday', 'Saturday'), 'Weekend', 'Weekday')

food$weekend <- ifelse(food$day %in% c('Sunday', 'Saturday'), 'Weekend', 'Weekday')

#remove date variable
food$date <- NULL

dim(food)
## [1] 7662   20
str(food)
## 'data.frame':    7662 obs. of  20 variables:
##  $ X_id    : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ unit    : chr  "KG" "KG" "KG" "KG" ...
##  $ category: chr  "cereals and tubers" "cereals and tubers" "cereals and tubers" "cereals and tubers" ...
##  $ price   : num  13 13 14.2 12.5 13.2 ...
##  $ currency: chr  "PKR" "PKR" "PKR" "PKR" ...
##  $ country : chr  "Pakistan" "Pakistan" "Pakistan" "Pakistan" ...
##  $ admname : chr  "Balochistan" "Balochistan" "Balochistan" "Balochistan" ...
##  $ adm1id  : chr  "2272" "2272" "2272" "2272" ...
##  $ mktname : chr  "Quetta" "Quetta" "Quetta" "Quetta" ...
##  $ mktid   : int  295 295 295 295 295 295 295 295 295 295 ...
##  $ cmid    : chr  "58" "58" "58" "58" ...
##  $ ptid    : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ umid    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ catid   : chr  "1" "1" "1" "1" ...
##  $ sn      : chr  "295_58_15_5" "295_58_15_5" "295_58_15_5" "295_58_15_5" ...
##  $ itemname: chr  "Wheat flour " "Wheat flour " "Wheat flour " "Wheat flour " ...
##  $ year    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ month   : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: NA NA NA NA NA NA NA NA NA NA ...
##  $ day     : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: NA NA NA NA NA NA NA NA NA NA ...
##  $ weekend : chr  "Weekday" "Weekday" "Weekday" "Weekday" ...
glimpse(food)
## Rows: 7,662
## Columns: 20
## $ X_id     <int> 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ unit     <chr> "KG", "KG", "KG", "KG", "KG", "KG", "KG", "KG", "KG", "KG", "…
## $ category <chr> "cereals and tubers", "cereals and tubers", "cereals and tube…
## $ price    <dbl> 13.0000, 13.0000, 14.2500, 12.5000, 13.2500, 13.4050, 13.6000…
## $ currency <chr> "PKR", "PKR", "PKR", "PKR", "PKR", "PKR", "PKR", "PKR", "PKR"…
## $ country  <chr> "Pakistan", "Pakistan", "Pakistan", "Pakistan", "Pakistan", "…
## $ admname  <chr> "Balochistan", "Balochistan", "Balochistan", "Balochistan", "…
## $ adm1id   <chr> "2272", "2272", "2272", "2272", "2272", "2272", "2272", "2272…
## $ mktname  <chr> "Quetta", "Quetta", "Quetta", "Quetta", "Quetta", "Quetta", "…
## $ mktid    <int> 295, 295, 295, 295, 295, 295, 295, 295, 295, 295, 295, 295, 2…
## $ cmid     <chr> "58", "58", "58", "58", "58", "58", "58", "58", "58", "58", "…
## $ ptid     <int> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1…
## $ umid     <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ catid    <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "…
## $ sn       <chr> "295_58_15_5", "295_58_15_5", "295_58_15_5", "295_58_15_5", "…
## $ itemname <chr> "Wheat flour ", "Wheat flour ", "Wheat flour ", "Wheat flour …
## $ year     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ month    <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ day      <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ weekend  <chr> "Weekday", "Weekday", "Weekday", "Weekday", "Weekday", "Weekd…

All data is clean and no duplicate is found either

Our response variable is price check its spread

summary(food$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   36.92   72.73  106.92  138.40  997.00

Big difference between median and mean seems rightly skewed data look at its histogram

par(mfrow = c(1,2))
hist(food$price, main = "Histogram of prices", col = "lightblue")
plot(density(food$price), main = "Density plot of prices")
polygon(density(food$price), col = "brown")

In regression situations, rightly skewed data cause poorly fitting model. Use log transformation

food$price <- log(food$price)
summary(food$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.197   3.609   4.287   4.241   4.930   6.905
#look at visuals
par(mfrow = c(1,2))
hist(food$price, main = "Histogram of prices", col = "lightblue")
plot(density(food$price), main = "Density plot of Prices")
polygon(density(food$price), col = "orange")

Log transforamtion gives significant nromal spread. #check outliers

outliers_remover <- function(a){
  df <- a
  aa <- c()
  count <- 1
  for(i in 1:ncol(df)){
    if(is.numeric(df[,i])){
      Q3 <- quantile(df[,i], 0.75, na.rm = TRUE)
      Q1 <- quantile(df[,i], 0.25, na.rm = TRUE) 
      IQR <- Q3 - Q1  #IQR(df[,i])
      upper <- Q3 + 1.5 * IQR
      lower <- Q1 - 1.5 * IQR
      for(j in 1:nrow(df)){
        if(is.na(df[j,i]) == TRUE){
            next
        }
            
        else if(df[j,i] > upper | df[j,i] < lower){
          aa[count] <- j
          count <- count+1                  
        }
      }
    }
  }
  
  df <- df[-aa,]
}

food_new <- outliers_remover(food)
str(food_new)
## 'data.frame':    5749 obs. of  20 variables:
##  $ X_id    : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ unit    : chr  "KG" "KG" "KG" "KG" ...
##  $ category: chr  "cereals and tubers" "cereals and tubers" "cereals and tubers" "cereals and tubers" ...
##  $ price   : num  2.56 2.56 2.66 2.53 2.58 ...
##  $ currency: chr  "PKR" "PKR" "PKR" "PKR" ...
##  $ country : chr  "Pakistan" "Pakistan" "Pakistan" "Pakistan" ...
##  $ admname : chr  "Balochistan" "Balochistan" "Balochistan" "Balochistan" ...
##  $ adm1id  : chr  "2272" "2272" "2272" "2272" ...
##  $ mktname : chr  "Quetta" "Quetta" "Quetta" "Quetta" ...
##  $ mktid   : int  295 295 295 295 295 295 295 295 295 295 ...
##  $ cmid    : chr  "58" "58" "58" "58" ...
##  $ ptid    : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ umid    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ catid   : chr  "1" "1" "1" "1" ...
##  $ sn      : chr  "295_58_15_5" "295_58_15_5" "295_58_15_5" "295_58_15_5" ...
##  $ itemname: chr  "Wheat flour " "Wheat flour " "Wheat flour " "Wheat flour " ...
##  $ year    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ month   : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: NA NA NA NA NA NA NA NA NA NA ...
##  $ day     : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: NA NA NA NA NA NA NA NA NA NA ...
##  $ weekend : chr  "Weekday" "Weekday" "Weekday" "Weekday" ...

Following outliers removal data shrunk from 7662 to 5273

prov_model <- lm(price ~ admname, data = food_new)
summary(prov_model)
## 
## Call:
## lm(formula = price ~ admname, data = food_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.80270 -0.54662 -0.08891  0.80983  1.74923 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                4.10529    0.02582 158.981  < 2e-16 ***
## admnameKhyber Pakhtunkhwa -0.13180    0.03568  -3.694 0.000223 ***
## admnamePunjab             -0.14598    0.03116  -4.685 2.86e-06 ***
## admnameSindh              -0.09818    0.03568  -2.752 0.005943 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8431 on 5745 degrees of freedom
## Multiple R-squared:  0.00404,    Adjusted R-squared:  0.00352 
## F-statistic: 7.769 on 3 and 5745 DF,  p-value: 3.563e-05
mkt_model <- lm(price ~ mktname, data = food_new)
summary(mkt_model)
## 
## Call:
## lm(formula = price ~ mktname, data = food_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.80270 -0.54138 -0.09234  0.81390  1.71727 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.00711    0.02461 162.815  < 2e-16 ***
## mktnameLahore   -0.01584    0.03484  -0.455  0.64925    
## mktnameMultan   -0.07976    0.03484  -2.290  0.02207 *  
## mktnamePeshawar -0.03362    0.03481  -0.966  0.33419    
## mktnameQuetta    0.09818    0.03567   2.752  0.00593 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8429 on 5744 degrees of freedom
## Multiple R-squared:  0.004623,   Adjusted R-squared:  0.00393 
## F-statistic: 6.669 on 4 and 5744 DF,  p-value: 2.373e-05
catg_model <- lm(price ~ category, data = food_new)
summary(catg_model)
## 
## Call:
## lm(formula = price ~ category, data = food_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28941 -0.20612  0.03255  0.30644  1.24335 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  3.564761   0.008233 432.998  < 2e-16 ***
## categorymeat, fish and eggs  1.484458   0.025797  57.543  < 2e-16 ***
## categorymiscellaneous food  -0.078128   0.021454  -3.642 0.000273 ***
## categoryoil and fats         1.689127   0.025857  65.326  < 2e-16 ***
## categorypulses and nuts      1.426651   0.018012  79.208  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4841 on 5744 degrees of freedom
## Multiple R-squared:  0.6718, Adjusted R-squared:  0.6715 
## F-statistic:  2939 on 4 and 5744 DF,  p-value: < 2.2e-16

#Multiple Linear Regression

mul_model <- lm(price ~ admname + mktname + category + itemname, data = food_new)
summary(mul_model)
## 
## Call:
## lm(formula = price ~ admname + mktname + category + itemname, 
##     data = food_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05536 -0.10669  0.05156  0.22451  0.85024 
## 
## Coefficients: (7 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.363349   0.014647 229.631  < 2e-16 ***
## admnameKhyber Pakhtunkhwa       -0.069293   0.014669  -4.724 2.37e-06 ***
## admnamePunjab                   -0.113255   0.014679  -7.716 1.41e-14 ***
## admnameSindh                    -0.035459   0.014666  -2.418   0.0157 *  
## mktnameLahore                    0.063919   0.014293   4.472 7.89e-06 ***
## mktnameMultan                          NA         NA      NA       NA    
## mktnamePeshawar                        NA         NA      NA       NA    
## mktnameQuetta                          NA         NA      NA       NA    
## categorymeat, fish and eggs      1.739247   0.020744  83.845  < 2e-16 ***
## categorymiscellaneous food       0.768669   0.020744  37.056  < 2e-16 ***
## categoryoil and fats             1.944007   0.020781  93.546  < 2e-16 ***
## categorypulses and nuts          1.882639   0.022206  84.782  < 2e-16 ***
## itemnameGhee (artificial)              NA         NA      NA       NA    
## itemnameLentils (masur)         -0.354042   0.027366 -12.937  < 2e-16 ***
## itemnameLentils (moong)         -0.264542   0.028313  -9.344  < 2e-16 ***
## itemnamePoultry                        NA         NA      NA       NA    
## itemnameRice (basmati, broken)   0.594522   0.015855  37.498  < 2e-16 ***
## itemnameRice (coarse)            0.380925   0.016418  23.202  < 2e-16 ***
## itemnameSalt                    -1.723951   0.029784 -57.882  < 2e-16 ***
## itemnameSugar                          NA         NA      NA       NA    
## itemnameWheat                    0.007794   0.017080   0.456   0.6482    
## itemnameWheat flour                    NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3456 on 5734 degrees of freedom
## Multiple R-squared:  0.833,  Adjusted R-squared:  0.8326 
## F-statistic:  2043 on 14 and 5734 DF,  p-value: < 2.2e-16
summary(food$price - mul_model$fitted.values)
## Warning in food$price - mul_model$fitted.values: longer object length is not a
## multiple of shorter object length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.6689 -0.5098  0.1890  0.2480  1.0964  3.4137

We want our median value to be centered around zero as this tells that are residuals are somewhat symmetrical and that model is predicting evenly at both high and low ends of our dataset. Our output shows that our model is quite symmetrical and is predicting both high and low prices. This can be further confirmed using quantile-quantile plot.

plot(mul_model)

#COefficents According the coefficients the equation is like price = 1.31 + (-0.02) * KPK + (-0.03) * punjab + (-0.01) * sindh + (0.02) * lahore + 0.31 * categorymeat, fish and egg + (-0.07)* categoryoil and fats + 0.3 * categorypulses and nuts

#Standard Error of Coefficient Standard error of coeff is an estimate of the standard deviaton of the coeff. This tells how much uncertainity there is with our coeff. The SE is often used to create confidence interval

#T-value T value is quick way to check if coeffient is signicant or not. T-statistic is coeff divided by SE.In general we want coeff to have large t-statistics this indicate that our SE is small in compariseon to our coeff. The larger T-statistic is, the more certain we can be that the coeff is not zero. The t-statistic is used to find p-value.

#p-value The p-value is calculated using the t-statistic from the T distribution. The p-value, in association with the t-statistic, help us to understand how significant our coefficient is to the model. In practice, any p-value below 0.05 is usually deemed as significant.It means we are confident that the coefficient is not zero, meaning the coefficient does in fact add value to the model by helping to explain the variance within our dependent variable. In our model, we can see that the p-values for the Intercept and points are extremely small. This leads us to conclude that there is strong evidence that the coefficients in this model are not zero.

#Residual Standard Error RSE The residual standard error is a measure of how well the model fits the data.In general we want the smaller the residual standard error possible, because that means our model’s prediction line is very close to actual values on average. In our model the actual values are 0.123 away from the predicted values or regression line.

#R-Squared The Multiple R-squared value is most often used for simple linear regression (one predictor). It tells us what percentage of the variation within our dependent variable that the independent variable is explaining. In other words, it’s another method to determine how well our model is fitting the data.In our model points explain 62% of the variation withing the price, our dependant variable.

#Adjusted R-Square The Adjusted R-squared value is used when running multiple linear regression and can conceptually be thought of in the same way we described Multiple R-squared. The Adjusted R-squared value shows what percentage of the variation within our dependent variable that all predictors are explaining. The difference between these two metrics is a nuance in the calculation where we adjust for the variance attributed by adding multiple variables.

#F-statistic and p-value When running a regression model, either simple or multiple, a hypothesis test is being run on the global model. The null hypothesis is that there is no relationship between the dependent variable and the independent variable(s) and the alternative hypothesis is that there is a relationship. Said another way, the null hypothesis is that the coefficients for all of the variables in your model are zero. The alternative hypothesis is that at least one of them is not zero.The F-statistic and overall p-value help us determine the result of this test. Looking at the F-statistic alone can be a little misleading depending on how many variables are in your test. If you have a lot of independent variables, it’s common for an F-statistic to be close to one and to still produce a p-value where we would reject the null hypothesis. However, for smaller models, a larger F-statistic generally indicates that the null hypothesis should be rejected. A better approach is to utilize the p-value that is associated with the F-statistic. Again, in practice, a p-value below 0.05 generally indicates that you have at least one coefficient in your model that isn’t zero. We can see from our model, the F-statistic is very large and our p-value is so small it is basically zero. This would lead us to reject the null hypothesis and conclude that there is strong evidence that a relationship does exist between salary and points.

#Choosing important vairales

#relative importance of variables
# relative_imp <- calc.relimp(mul_model, type = "lmg", rela = TRUE)
# 
# sort(relative_imp$lmg, decreasing = TRUE)

#Stepwise regression Stepwise regression provides a number of stopping rules for selecting the best subset of variables for our model. In stepwise selection, variables are added to or deleted from a model one at a time, until some stopping criterion is reached.

# mul_model1 <- lm(price ~ admname + mktname + category, data = food_new)
# 
# mul_model2 <- lm(price ~ admname + mktname, data = food_new)
# 
# mul_model3 <- lm(price ~ category, data = food_new)
# 
# com_mod <- compareLM(mul_model1, mul_model2)
# com_mod
mul_model <- lm(price ~ admname + mktname + category, data = food_new)
stepAIC(mul_model, direction = "backward")
## Start:  AIC=-8377.45
## price ~ admname + mktname + category
## 
## 
## Step:  AIC=-8377.45
## price ~ mktname + category
## 
##            Df Sum of Sq    RSS     AIC
## <none>                  1334.7 -8377.4
## - mktname   4     11.19 1345.9 -8337.5
## - category  4   2746.53 4081.2 -1959.8
## 
## Call:
## lm(formula = price ~ mktname + category, data = food_new)
## 
## Coefficients:
##                 (Intercept)                mktnameLahore  
##                     3.58110                     -0.01490  
##               mktnameMultan              mktnamePeshawar  
##                    -0.07882                     -0.03524  
##               mktnameQuetta  categorymeat, fish and eggs  
##                     0.05805                      1.48223  
##  categorymiscellaneous food         categoryoil and fats  
##                    -0.08033                      1.68697  
##     categorypulses and nuts  
##                     1.42457

#Multicollinearity, Variation Inflation Factor (VIF)¶ Multicollinearity(or substantial correlation) generally occurs when there are high correlations between two or more predictor variables. In other words, one predictor variable can be used to predict the other. For example: a person’s hight and weight Not as before methods, the VIF checks the model.

As a general rule of thumb, a VIF for a predictor greater than 5 or 10 indicates that multicollinearity. or square-root(VIF) > 2 indicates multicollinearity problem.

# library(car)
# vif(mul_model1)
# 
# #to predict which variables are perfectly correlated
# cor(food_new)

#best subsets
#subsets <- regsubsets(price ~ ., data = food_new, nbest = 2)

#check model assumptions we can create a histogram of residuals from a linear model. The distribution of these residuals should be approximately normal. We can find the residuals of a model by using residuals() function.

mod_re <- residuals(mul_model)
hist(mod_re)

The distribution is right skewed slightly.

#Check the linearity between residuals and predicted values. The residuals should be unbiased and homoscedastic

par(mfrow = c(2,2))
plot(mul_model)

food <- read.csv("pk_food_prices.csv")
food$price <- as.numeric(food$price)
## Warning: NAs introduced by coercion
food$price <- log(food$price) 

#create training and Test data  
trainingrowindex <- sample(1:nrow(food_new), 0.7 * nrow(food_new))
trainingData <- food_new[trainingrowindex, ] #train data
testData <- food_new[-trainingrowindex, ]   #test data

#build model on training data
model1 <- lm(price ~ admname + mktname + category + itemname, data = trainingData)

#predict
pred1 <- predict(model1, testData)
## Warning in predict.lm(model1, testData): prediction from a rank-deficient fit
## may be misleading
actuals_pred1 <- data.frame(cbind(actuals = testData$price, predicted = pred1))

cor(actuals_pred1)
##             actuals predicted
## actuals   1.0000000 0.9095486
## predicted 0.9095486 1.0000000