1 Data Introduction

  This data is from Kaggle. This data set includes hourly air pollutants data from Tiantan nationally-controlled air-quality monitoring sites. The air-quality data are from the Beijing Municipal Environmental Monitoring Center. The meteorological data in each air-quality site are matched with the nearest weather station from the China Meteorological Administration. The time period is from March 1st, 2013 to February 28th, 2017. Missing data are denoted as NA.

1.1 Source

@ Web page in UCI Machine Learning Repository

1.2 Abstract

  This hourly data set considers 6 main air pollutants and 6 relevant meteorological variables at multiple sites in Beijing.

Here is the abstract of the data set:

  • Data Set Characteristics: Multivariate, Time-Series
  • Number of Instances: 420768
  • Area: Physical
  • Attribute Characteristics: Integer, Real
  • Number of Attributes: 18
  • Date Donated: 2019-09-20
  • Associated Tasks: Regression

2 Data Exploratory Analysis

Language: R
coding: utf-8

2.1 Import Library

library(ggplot2)
library(plyr)
library(utils)

2.2 Import Data Set

data = read.csv('data_tiantan.csv')
names(data)
##  [1] "No"      "year"    "month"   "day"     "hour"    "PM2.5"   "PM10"   
##  [8] "SO2"     "NO2"     "CO"      "O3"      "TEMP"    "PRES"    "DEWP"   
## [15] "RAIN"    "wd"      "WSPM"    "station"

  The data set has 18 attributes, but we only use 1st to 14th attributes, so we need to split it:

use_data = data[,c(1:14)]
names(use_data)
##  [1] "No"    "year"  "month" "day"   "hour"  "PM2.5" "PM10"  "SO2"   "NO2"  
## [10] "CO"    "O3"    "TEMP"  "PRES"  "DEWP"

  Show the detailed information of this data set:

summary(use_data)
##        No             year          month             day       
##  Min.   :    1   Min.   :2013   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 8767   1st Qu.:2014   1st Qu.: 4.000   1st Qu.: 8.00  
##  Median :17533   Median :2015   Median : 7.000   Median :16.00  
##  Mean   :17533   Mean   :2015   Mean   : 6.523   Mean   :15.73  
##  3rd Qu.:26298   3rd Qu.:2016   3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :35064   Max.   :2017   Max.   :12.000   Max.   :31.00  
##                                                                 
##       hour           PM2.5             PM10            SO2          
##  Min.   : 0.00   Min.   :  3.00   Min.   :  2.0   Min.   :  0.5712  
##  1st Qu.: 5.75   1st Qu.: 22.00   1st Qu.: 41.0   1st Qu.:  3.0000  
##  Median :11.50   Median : 59.00   Median : 85.0   Median :  7.0000  
##  Mean   :11.50   Mean   : 82.16   Mean   :106.4   Mean   : 14.3676  
##  3rd Qu.:17.25   3rd Qu.:113.00   3rd Qu.:144.0   3rd Qu.: 17.0000  
##  Max.   :23.00   Max.   :821.00   Max.   :988.0   Max.   :273.0000  
##                  NA's   :677      NA's   :597     NA's   :1118      
##       NO2               CO              O3                TEMP       
##  Min.   :  2.00   Min.   :  100   Min.   :  0.4284   Min.   :-16.80  
##  1st Qu.: 28.00   1st Qu.:  500   1st Qu.:  8.0000   1st Qu.:  3.10  
##  Median : 47.00   Median :  900   Median : 40.0000   Median : 14.60  
##  Mean   : 53.16   Mean   : 1298   Mean   : 55.9843   Mean   : 13.67  
##  3rd Qu.: 71.00   3rd Qu.: 1600   3rd Qu.: 81.0000   3rd Qu.: 23.50  
##  Max.   :241.00   Max.   :10000   Max.   :674.0000   Max.   : 41.10  
##  NA's   :744      NA's   :1126    NA's   :843        NA's   :20      
##       PRES             DEWP        
##  Min.   : 987.1   Min.   :-35.300  
##  1st Qu.:1004.0   1st Qu.: -8.800  
##  Median :1012.2   Median :  3.000  
##  Mean   :1012.5   Mean   :  2.448  
##  3rd Qu.:1020.9   3rd Qu.: 15.000  
##  Max.   :1042.0   Max.   : 28.800  
##  NA's   :20       NA's   :20

2.3 Choose one attribute

  Here we choose “PM2.5” as the variable for our exploring research, because it is numerical value and easy to operate and observe.

data0 = na.omit(use_data[,'PM2.5'])                       # Delete the records with missed value.
N = length(data0)                                         # The length of variable.
data1 = cut(data0, breaks = c(0, (1:15)*50, max(data0)))  # Grouping variable.

2.4 Observe the group information

2.4.1 Group tentatively

data1 = data.frame(data1)                                   # Convert data into dataframe
dataf = ddply(data1, "data1", summarise, n = length(data1)) # Statistical grouping information
dataf$组号 = c(1:length(dataf[,1]))                           # Add group code
ggplot(dataf, aes(x = 组号, y = n)) + geom_bar(stat = "identity", fill = 'red') + 
  geom_text(aes(label = n), color = "black", vjust = -0.3) + 
  labs(title = "Exponent of PM2.5") + xlab('Group Code') + ylab('Counts')

  We find the distribution is strongly skewed, so we need to adjust the group gap to make it normal.

2.4.2 Adjust the group gap

data2 = cut(data0, breaks = c(0,10,25,(1:5)*50, max(data0)))
data2 = data.frame(data2)
dataf2 = ddply(data2, "data2", summarise, n = length(data2))
dataf2$组号 = c(1:length(dataf2[,1]))
ggplot(dataf2, aes(x = 组号, y = n)) + geom_bar(stat = "identity", fill = 'red') + 
  geom_text(aes(label = n), color = "black", vjust = -0.3) + 
  labs(title = "Exponent of PM2.5") + xlab('Group Code') + ylab('Counts')

  Now this distribution looks great!

2.5 Caculate frequency

PD = table(data2)/N
PD
## data2
##     (0,10]    (10,25]    (25,50]   (50,100]  (100,150]  (150,200]  (200,250] 
## 0.10797685 0.16811586 0.17471719 0.25498008 0.13540001 0.07249833 0.04045133 
##  (250,821] 
## 0.04586035

2.6 Sample tentatively

  In order to make sure the code(especially the function we write) we used could run without bugs, we can sample tentatively before starting.

2.6.1 Sampling function

func1 = function(i){
  set.seed(1)
  p = sample(data0, i)
  p = c(p, matrix(NA, 1, max(samp) - length(p)))
  return (p)
}

2.6.2 Function calculating the sample quality

func2 = function(datasamp){
  datasamp_1 = cut(na.omit(datasamp), breaks = c(0,10,25,(1:5)*50,max(data0)))
  PS = table(datasamp_1)/length(na.omit(datasamp)) + 0.0000001
  J = sum((PS-PD) * log(PS/PD))
  q = exp(-J)
  return(q)
}

2.6.3 Show the result

samp = c(100,1000,5000,10000)
n = length(samp)
samp = as.matrix(samp)   # Function apply require the type of data should be "matrix"
ma = apply(samp, 1, func1)
Q1 = apply(ma, 2, func2)
Q1
## [1] 0.9641881 0.9884670 0.9991390 0.9993164

2.7 Select the optimal sample

2.7.1 Calculate the quality of different sample size

x = seq(6,15,by = 0.1)
y = 2^(x)
plot(x, y, ylab = "Sample size", xlab = "Variable X", main = "Choose the sample size", 
     col = "blue" )

head(y, 20)
##  [1]  64.00000  68.59350  73.51669  78.79324  84.44851  90.50967  97.00586
##  [8] 103.96831 111.43047 119.42822 128.00000 137.18700 147.03339 157.58648
## [15] 168.89701 181.01934 194.01172 207.93661 222.86094 238.85645
samp = round(y)[-c(1:10)]                 # The first 10 sample size chages too slowly
n = length(samp)
samp = as.matrix(samp)
ma2 = apply(samp, 1, func1)
Q2 = apply(ma2, 2, func2) 
Q2
##  [1] 0.9749737 0.9799594 0.9816330 0.9777160 0.9853738 0.9830940 0.9842644
##  [8] 0.9891409 0.9872381 0.9814159 0.9846753 0.9764744 0.9785833 0.9813134
## [15] 0.9835726 0.9837194 0.9848994 0.9866820 0.9874570 0.9888718 0.9860133
## [22] 0.9814066 0.9788355 0.9775980 0.9844250 0.9878079 0.9894761 0.9877285
## [29] 0.9900763 0.9899412 0.9880280 0.9895247 0.9911530 0.9920905 0.9912475
## [36] 0.9910827 0.9922382 0.9915958 0.9944383 0.9945780 0.9950977 0.9959123
## [43] 0.9963870 0.9965372 0.9972032 0.9971778 0.9972605 0.9979517 0.9977511
## [50] 0.9983812 0.9984424 0.9986562 0.9990817 0.9992554 0.9991797 0.9992841
## [57] 0.9992680 0.9992259 0.9994226 0.9993486 0.9995607 0.9995796 0.9993612
## [64] 0.9992324 0.9994255 0.9994833 0.9995162 0.9995781 0.9995975 0.9997292
## [71] 0.9996722 0.9997684 0.9998087 0.9998811 0.9998771 0.9999015 0.9998631
## [78] 0.9998943 0.9999429 0.9999883 0.9999929
plot(samp, Q2, xlab = 'sample size', ylab = 'sample quality', 
     main = 'Select the optimal sample size',col = 'blue')

2.7.2 Plot the figure

df_input = data.frame(samp, Q2)
ggplot(df_input, aes(x = samp, y = Q2)) + geom_point(size = 3) + 
  geom_hline(yintercept = 0.99, color = 'red') + 
  annotate("text", x = 20000, y = 0.993, label = "sample_quality = 0.99", 
           fontface = "italic", color = 'blue') +
  annotate("segment", x = 20000, xend = 15000, y = 0.992, yend = 0.990, 
           arrow = arrow(), color= 'blue', size = 1)

3 Other Work (outside of assignment)

  Apart from the assignment, I also tried the other code in our course book.

# 3.2
train_data = read.csv('train.csv', header = TRUE, fileEncoding = 'UTF-8')
# Generate equidistant points -- Vector
x = seq(6,20,by = 0.1) 
y = 2^(x)
plot(x,y)

# Rounded
samp = round(y)
# elete data in 1:51,69:90
samp = samp[-c(1:51)][-c(69:90)]
n = length(samp)

n1 = length(train_data[,1])
data0 = na.omit(train_data[,'Age'])
N = length(data0)
# Attribute Age - 177 data missing
missing_count_age = n1 - N 
rate1 = N/n1 ; rate2 = missing_count_age/n1
pie(c(missing_count_age,N),labels = c('missed\n177 (19.9%)','non_missed \n714 (80.1%)'),
    col = rainbow(2, alpha = 0.2))+
  title(main = 'The missing value proportion diagram')

## integer(0)
# data1 -> Delimit each variable into its corresponding interval and return 
#                                        the region to which it belongs
data1 = cut(data0, breaks = c(0, (1:7)*10, max(data0)))
View(data1)
# table function used to count
PD = table(data1)/N # calculate frequency
View(PD)

hist1 = hist(data0, breaks = 10, xlab = 'Age', ylab = 'counts', col = rainbow(8, alpha = 0.4),
             main = 'TitanicAge distribution histogram')

# rep -- > repeat
Q = rep(0,n)
View(Q)
J = NULL

## Simple random sampling

set.seed(1)
# There are only 714 pieces of data, so you can't do anything in the book
# Rewrite the samp
samp = c(1:7)*100
samp = as.matrix(samp)
n = length(samp)

func1 = function(count){
  p = sample(data0, count)
  p = c(p, matrix(NA, 1 ,samp[n] - length(p)))
  return (p)
}

# apply -> Samp is mapped into the variables through Func1, 
#   with the second variable 1 being mapped by rows and 2 by columns
ma = apply(samp, 1, func1)

func2 = function(data_sample){
  # Minus NA -> data_sample_1
  data_sample_1 = cut(na.omit(data_sample), breaks = c(0, (1:7)*10, max(data0))) 
  # Notice that the data0 here means the population distribution
  PS = table(data_sample_1)/length(na.omit(data_sample)) + 0.0000000001
  J = sum( (PS-PD) * (log(PS/PD)) )
  q = exp(-J)
  return (q)
}
# Output data quality values
Q1 = apply(ma, 2, func2)

barplot(Q1, names.arg = c(1:7)*100, col = rainbow(7, alpha = 0.4), 
        xlab = 'Sample size', ylab='Sample quality', ylim = c(0,1)) +
  title('Sample quality change')

## numeric(0)
## Stratified sampling

str = length(levels(data1))
View(data1)
# Merge column vectors. Note that when you merge, data1 columns 
#   no longer hold intervals, but indexes of intervals
data2 = cbind(data0, data1)

func3 = function(s){
  p = NULL
  for(j in 1:str){
    samp2 = NULL
    samp2 = sample( (1:N)[data2[,2] == j], round(s*PD[j]) )  
    #(1:N)[data2[,2] == j] Equivalent to return index
    p = c(p, samp2)
  }
  res = c(data0[p], matrix(NA, 1, samp[n] + 5 - length(p)) )
  # Why? + 5 - len(p)
  # Explanation:round -> When taking an integer may abandon 0.4, 
  #   so need to do a +5 processing, to prevent the occurrence of negative numbers!!
  print(length(p))
  return(res)
}

mb = apply(samp, 1, func3)
## [1] 100
## [1] 199
## [1] 300
## [1] 401
## [1] 501
## [1] 599
## [1] 700
Q2 = apply(mb, 2, func2)

barplot(Q2, names.arg = c(1:7)*100, col = rainbow(7, alpha = 0.4), 
        xlab = 'Sample size', ylab='Sample quality', ylim = c(0,1)) +
  title('Sample quality change')

## numeric(0)
# 3.3  Probability sampling

# clear work space
rm(list = ls())
library(sampling)

# Simple random sampling
data = read.csv('train.csv', header = TRUE)
names(data)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"
# Missing value information
nap = which(is.na(data), arr.ind = TRUE)
miss = nap[,1]
# Deletes rows with missing data
data11 = data
data12 = data11[-miss,]

# The total number
N = dim(data)[1]
# Select the number of samples
n = 500
# srswor:never put it back ; srswr:always put it back
srsp = srswor(n, N)  # Pay Attention:return index
srs = getdata(data, srsp)
#View(srs)

length(srs[,1])
## [1] 500
meanY = colMeans(data[,c(6,10)])
meany = colMeans(srs[,c(7,11)])    
#A column of serial numbers is automatically generated, corresponding to 
#   the index in the original sample
error = meanY - meany

# Stratified sampling
data$Pclass = factor(data$Pclass, levels = as.character(1:3)) 
  #Convert to the Factor class
weights = n * table(data$Pclass)/N
order = order(data$Pclass)
srp = strata(data = data[order,], stratanames = "Pclass", 
             size = weights, method = "srswor")
sr = getdata(data, srp)

# Cluster sampling
scp = cluster(data = data, clustername = "SibSp", size = 4, method = 'srswor',
              description = FALSE)
sc = getdata(data, scp)                

# Systematic sampling
i = rep(1, N) 
pik1 = inclusionprobabilities(i, n) # Take n samples with equal probability       
# View(pik1)
ssp = UPsystematic(pik1, eps = 1e-6)  #eps control the range of pik1 in (eps,1-eps)
ss = getdata(data, ssp)

# Multistage sampling
msp = mstage(data = data, stage = c("cluster","cluster"), 
             varnames = list("SibSp","Pclass"), 
             size = list(4,1), method = c("srswor","srswor"),description = FALSE)
ms = getdata(data,msp)                
mss = ms[[2]]                

# Unequal probability sampling
# The inclusion probability is calculated based on the size of Fare
Fare = data$Fare
# View(Fare)
#Avoid the appearance of zero 0
Fare = Fare + 1
pik2 = inclusionprobabilities(Fare, n)
usp = UPmidzuno(pik2)
us = getdata(data, usp)

# Double sampling
# Two simple random sampling here
## first sampling
srsp1 = srswor(700, N)
srs1 = getdata(data, srsp1)
## second sampling
srsp2 = srswor(n, 700)
srs2 = getdata(srs1, srsp2)