soil_df <- fread("../Soil_Nutrient_Year.csv", header = T, stringsAsFactors = F, showProgress = T)
soil_df$V1 <- NULL
summary(soil_df)
## DistrictId DistrictName BlockId BlockName
## Min. :69.00 Length:685478 Min. : 447.0 Length:685478
## 1st Qu.:74.00 Class :character 1st Qu.: 483.0 Class :character
## Median :78.00 Mode :character Median : 515.0 Mode :character
## Mean :77.97 Mean : 615.4
## 3rd Qu.:81.00 3rd Qu.: 539.0
## Max. :89.00 Max. :6778.0
##
## SoilPh ElectricalConductivity OrganicCarbon
## Min. : 0.010 Min. :0.000 Min. : 0.0000
## 1st Qu.: 7.580 1st Qu.:0.340 1st Qu.: 0.2500
## Median : 7.810 Median :0.520 Median : 0.3200
## Mean : 7.992 Mean :0.678 Mean : 0.4949
## 3rd Qu.: 8.100 3rd Qu.:0.800 3rd Qu.: 0.3900
## Max. :96.000 Max. :5.000 Max. :97.1400
## NA's :6319 NA's :7917 NA's :1093
## Nitrogen Phosphorous Potassium Sulphur
## Min. : 0.1 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 61.4 1st Qu.: 4.90 1st Qu.: 93.39 1st Qu.: 19.80
## Median : 77.9 Median : 8.70 Median : 172.00 Median : 44.48
## Mean : 82.6 Mean : 16.70 Mean : 238.89 Mean : 71.58
## 3rd Qu.: 97.1 3rd Qu.: 14.49 3rd Qu.: 326.17 3rd Qu.: 92.98
## Max. :9714.0 Max. :9890.00 Max. :9980.00 Max. :999.00
## NA's :659896 NA's :2639 NA's :2643 NA's :30037
## Zinc Iron Copper Magnesium
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.750 1st Qu.: 4.29 1st Qu.: 0.6 1st Qu.: 2.440
## Median : 1.370 Median : 7.77 Median : 1.1 Median : 4.740
## Mean : 3.423 Mean : 13.42 Mean : 2.9 Mean : 9.157
## 3rd Qu.: 2.520 3rd Qu.: 14.06 3rd Qu.: 2.1 3rd Qu.: 9.090
## Max. :996.000 Max. :999.00 Max. :984.7 Max. :999.000
## NA's :5764 NA's :5647 NA's :352636 NA's :9963
## Boron Year
## Min. : 0.0 Min. :2013
## 1st Qu.: 0.3 1st Qu.:2015
## Median : 1.4 Median :2015
## Mean : 18.0 Mean :2015
## 3rd Qu.: 10.1 3rd Qu.:2015
## Max. :937.6 Max. :2016
## NA's :680505
table(is.na(soil_df$SoilPh))
##
## FALSE TRUE
## 679159 6319
table(soil_df$SoilPh>14) # 1652 records have incorrect data (<1%)
##
## FALSE TRUE
## 677507 1652
6319 is missing data (<1%).
1652 rows have incorrect data, >14.
Average value: 7.9917812
ggplot(data=soil_df[soil_df$SoilPh<=14,], aes(x=Year, y=SoilPh, fill=as.factor(Year))) + geom_boxplot()
table(is.na(soil_df$ElectricalConductivity))
##
## FALSE TRUE
## 677561 7917
7917 missing value in EC (~1%).
ggplot(data=soil_df, aes(x=as.factor(Year), y=ElectricalConductivity, fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 7917 rows containing non-finite values (stat_boxplot).
Looking at the historgram
ggplot(soil_df) + aes(x = ElectricalConductivity) + geom_histogram(bins = 5, colour ="black", fill="#FF6666")
## Warning: Removed 7917 rows containing non-finite values (stat_bin).
Missing values
table(is.na(soil_df$OrganicCarbon))
##
## FALSE TRUE
## 684385 1093
1093 missing vales. Mean value: 0.494917.
OC above 1 vs less than 1
table(soil_df$OrganicCarbon<=1)
##
## FALSE TRUE
## 4850 679535
Only 0.7086654% of OC value is above 1. Some treatment is required.
temp <- soil_df[soil_df$OrganicCarbon<=1,]
ggplot(data=temp, aes(x=as.factor(Year), y=OrganicCarbon, fill=as.factor(Year))) + geom_boxplot()
rm(temp)
Missing Values
table(is.na(soil_df$Nitrogen))
##
## FALSE TRUE
## 25582 659896
Majority of data is missing for Nitrogen. (96.2680057%)
HYPOTHESIS: Organic Compound (OC) can be used as the proxy variable for Nitrogen.
data_nit_oc <- soil_df[is.na(soil_df$Nitrogen)==F & is.na(soil_df$OrganicCarbon)==F,]
summary(lm(Nitrogen ~ OrganicCarbon, data_nit_oc))
##
## Call:
## lm(formula = Nitrogen ~ OrganicCarbon, data = data_nit_oc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.7 -21.7 -5.2 14.0 9630.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0228 0.8120 102.246 <2e-16 ***
## OrganicCarbon 0.4818 0.5842 0.825 0.409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 126.4 on 25412 degrees of freedom
## Multiple R-squared: 2.677e-05, Adjusted R-squared: -1.258e-05
## F-statistic: 0.6803 on 1 and 25412 DF, p-value: 0.4095
Correlation value: 0.0051742.
Both R Square and Correlation clearly suggests that Nitrogen Value and OC values are not correlated.
Clearly, the hypothesis is incorrect.
With a very high missing values for Nitrogen, we can easily remove the variable.
Missing Values
table(is.na(soil_df$Phosphorous))
##
## FALSE TRUE
## 682839 2639
Low Missing Value 0.3849868%.
Mean 16.7013626.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Phosphorous)==F,], aes(x=as.factor(Year), y=Phosphorous, fill=as.factor(Year))) + geom_boxplot()
upper_lmt <- 14.49 + (1.5*IQR(soil_df$Phosphorous, na.rm = T)) #Taken from Summary
table(soil_df$Phosphorous<=upper_lmt)
##
## FALSE TRUE
## 70340 612499
Approximately, 11% of above the outlier range, which is very high.
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Phosphorous)==F,], aes(x=as.factor(Year), y=log(Phosphorous), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 66 rows containing non-finite values (stat_boxplot).
The data looks normally distributed on log transformation.
Missing Values
table(is.na(soil_df$Potassium))
##
## FALSE TRUE
## 682835 2643
Low Missing Value 0.3855704%.
Mean 238.8920928.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Potassium)==F,], aes(x=as.factor(Year), y=Potassium, fill=as.factor(Year))) + geom_boxplot()
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Potassium)==F,], aes(x=as.factor(Year), y=log(Potassium), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
Log transformation is highly normal distributed.
Missing Values
table(is.na(soil_df$Sulphur))
##
## FALSE TRUE
## 655441 30037
Low Missing Value 4.3819058%.
Mean 71.5763594.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Sulphur)==F,], aes(x=as.factor(Year), y=Sulphur, fill=as.factor(Year))) + geom_boxplot()
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Sulphur)==F,], aes(x=as.factor(Year), y=log(Sulphur), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 54 rows containing non-finite values (stat_boxplot).
Log transformation is normal distributed with a lttle skewness on left side.
Missing Values
table(is.na(soil_df$Zinc))
##
## FALSE TRUE
## 679714 5764
Low Missing Value 0.8408731%.
Mean 3.4232712.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Zinc)==F,], aes(x=as.factor(Year), y=Zinc, fill=as.factor(Year))) + geom_boxplot()
Lot of outliers.
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Zinc)==F,], aes(x=as.factor(Year), y=log(Zinc), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 1666 rows containing non-finite values (stat_boxplot).
Log transformation is normal distributed.
Missing Values
table(is.na(soil_df$Iron))
##
## FALSE TRUE
## 679831 5647
Low Missing Value 0.8238047%.
Mean 13.4236729.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Iron)==F,], aes(x=as.factor(Year), y=Iron, fill=as.factor(Year))) + geom_boxplot()
Again lot of outliers.
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Iron)==F,], aes(x=as.factor(Year), y=log(Iron), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 513 rows containing non-finite values (stat_boxplot).
Log transformation is normal distributed.
Missing Values
table(is.na(soil_df$Copper))
##
## FALSE TRUE
## 332842 352636
Very high Missing Value 51.44381%.
NEED TO DISCUSS
Mean 2.8994019.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Copper)==F,], aes(x=as.factor(Year), y=Copper, fill=as.factor(Year))) + geom_boxplot()
Again lot of outliers.
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Copper)==F,], aes(x=as.factor(Year), y=log(Copper), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 561 rows containing non-finite values (stat_boxplot).
Log transformation is normal distributed.
Missing Values
table(is.na(soil_df$Magnesium))
##
## FALSE TRUE
## 675515 9963
Missing Value of 1.4534383% is a little high but impuation should be fine. NEED TO DISCUSS.
Mean 9.1571879.
Box Plot
ggplot(data=soil_df[is.na(soil_df$Magnesium)==F,], aes(x=as.factor(Year), y=Magnesium, fill=as.factor(Year))) + geom_boxplot()
Again lot of outliers.
Box Plot on log transformation
ggplot(data=soil_df[is.na(soil_df$Magnesium)==F,], aes(x=as.factor(Year), y=log(Magnesium), fill=as.factor(Year))) + geom_boxplot()
## Warning: Removed 1796 rows containing non-finite values (stat_boxplot).
Log transformation is normal distributed.
Missing Values
table(is.na(soil_df$Boron))
##
## FALSE TRUE
## 4973 680505
Almost all values are missing. Missing Value in 99.2745208% of data. NEED TO REMOVE THIS feature.