library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
library(outliers)
Agri_Land_India <- read_csv("D:/MS Analytics/Data Wrangling/Assignment 2/datasets_19581_25424_Indias Agricultural land details.csv")
## Parsed with column specification:
## cols(
## `States/UTs` = col_character(),
## Agriculturalland = col_double()
## )
head(Agri_Land_India)
Agri_Harvest_Winnow_thresh <- read_csv("D:/MS Analytics/Data Wrangling/Assignment 2/datasets_19581_25424_Indian states agricultural harvesting winnowing threshing rate.csv")
## Parsed with column specification:
## cols(
## Particulars = col_character(),
## States = col_character(),
## Gender = col_character(),
## `2011` = col_double(),
## `2012` = col_double(),
## `2013` = col_double(),
## `2014` = col_double()
## )
head(Agri_Harvest_Winnow_thresh)
str(Agri_Land_India)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 28 obs. of 2 variables:
## $ States/UTs : chr "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
## $ Agriculturalland: num 15930 424 3217 6582 5552 ...
## - attr(*, "spec")=
## .. cols(
## .. `States/UTs` = col_character(),
## .. Agriculturalland = col_double()
## .. )
str(Agri_Harvest_Winnow_thresh)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 32 obs. of 7 variables:
## $ Particulars: chr "Harvesting, Winnowing and Threshing" "Harvesting, Winnowing and Threshing" "Harvesting, Winnowing and Threshing" "Harvesting, Winnowing and Threshing" ...
## $ States : chr "Andhra Pradesh" "Assam" "Bihar" "Gujarat" ...
## $ Gender : chr "Men" "Men" "Men" "Men" ...
## $ 2011 : num 214 181 184 133 339 ...
## $ 2012 : num 237 179 173 157 322 ...
## $ 2013 : num 232 185 178 163 316 ...
## $ 2014 : num 239 191 185 164 310 ...
## - attr(*, "spec")=
## .. cols(
## .. Particulars = col_character(),
## .. States = col_character(),
## .. Gender = col_character(),
## .. `2011` = col_double(),
## .. `2012` = col_double(),
## .. `2013` = col_double(),
## .. `2014` = col_double()
## .. )
Agri_Harvest_Winnow_thresh$Gender <- factor(Agri_Harvest_Winnow_thresh$Gender, levels = c("Men","Women"), labels = c(1,0))
levels(Agri_Harvest_Winnow_thresh$Gender)
## [1] "1" "0"
is.factor(Agri_Harvest_Winnow_thresh$Gender)
## [1] TRUE
colSums(is.na(Agri_Land_India))
## States/UTs Agriculturalland
## 0 0
colSums(is.na(Agri_Harvest_Winnow_thresh))
## Particulars States Gender 2011 2012 2013
## 0 0 0 0 0 0
## 2014
## 1
Agri_Harvest_Winnow_thresh %>% summary()
## Particulars States Gender 2011 2012
## Length:32 Length:32 1:17 Min. :126.3 Min. :135.8
## Class :character Class :character 0:15 1st Qu.:161.6 1st Qu.:158.8
## Mode :character Mode :character Median :201.6 Median :194.8
## Mean :220.6 Mean :219.7
## 3rd Qu.:252.0 3rd Qu.:241.0
## Max. :501.1 Max. :501.1
##
## 2013 2014
## Min. :135.6 Min. :133.9
## 1st Qu.:163.4 1st Qu.:165.8
## Median :197.3 Median :191.9
## Mean :222.6 Mean :211.9
## 3rd Qu.:240.8 3rd Qu.:236.7
## Max. :501.1 Max. :379.2
## NA's :1
Agri_Harvest_Winnow_thresh$`2014`[is.na(Agri_Harvest_Winnow_thresh$`2014`=="NA")] = 211.9
colSums(is.na(Agri_Harvest_Winnow_thresh))
## Particulars States Gender 2011 2012 2013
## 0 0 0 0 0 0
## 2014
## 0
Agri_Harvest1=Agri_Harvest_Winnow_thresh
Agri_Harvest1 <- Agri_Harvest_Winnow_thresh %>% gather('2011','2012','2013','2014', key = "Year", value = "Sqft_area")
head(Agri_Harvest1)
names(Agri_Land_India)
## [1] "States/UTs" "Agriculturalland"
State_rename <- rename(Agri_Land_India, States="States/UTs")
head(State_rename)
subset1 <- left_join(Agri_Harvest1, State_rename, by = "States")
head(subset1)
subset2=subset1
subset2 <- subset1 %>% mutate(Land_for_Harvest_Winnow_Threshing = Sqft_area/Agriculturalland*100)
head(subset2)
str(subset2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 128 obs. of 7 variables:
## $ Particulars : chr "Harvesting, Winnowing and Threshing" "Harvesting, Winnowing and Threshing" "Harvesting, Winnowing and Threshing" "Harvesting, Winnowing and Threshing" ...
## $ States : chr "Andhra Pradesh" "Assam" "Bihar" "Gujarat" ...
## $ Gender : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : chr "2011" "2011" "2011" "2011" ...
## $ Sqft_area : num 214 181 184 133 339 ...
## $ Agriculturalland : num 15930 3217 6582 12661 3664 ...
## $ Land_for_Harvest_Winnow_Threshing: num 1.35 5.62 2.8 1.05 9.25 ...
subset2 %>% boxplot(Sqft_area~Year,data = .)
cap <- function(x){
quantiles <- quantile(x,c(0.05,0.25,0.75,0.95))
x[x<quantiles[2]-1.5*IQR(x)] <- quantiles [1]
x[x>quantiles[3]+1.5*IQR(x)] <- quantiles [4]
x
}
subset3 <- subset1 %>% group_by(Year) %>% mutate(Sqft_area=cap(Sqft_area))
head(subset2)
subset2$Sqft_area %>% hist(main="Histogram - Square foot area", col = "skyblue")
log10(subset2$Sqft_area) %>% hist(main="Histogram - Logarithm of Square foot area", col = "yellow")
NOTE: Note that sometimes the order of the tasks may be different than the order given here. For example, you may need to tidy the data sets first to be able to create the common key to merge. Therefore, for such cases you may have a different ordering of the sections.
Any further or optional pre-processing tasks can be added to the template using an additional section in the R Markdown file. Make sure your code is visible (within the margin of the page). Do not use View() to show your data, instead give headers (using head() )