# Point the session at the folder that holds the input file.
# NOTE(review): hard-coded, machine-specific path; the relative file name
# passed to read.csv() below resolves against this directory.
setwd("C:/Users/walki/Documents/")
# Load datasets.csv into a data frame and show per-column summaries.
data <- read.csv(file = "datasets.csv")
summary(data)
## Package Item Title Rows
## Length:1745 Length:1745 Length:1745 Min. : 2
## Class :character Class :character Class :character 1st Qu.: 35
## Mode :character Mode :character Mode :character Median : 108
## Mean : 3861
## 3rd Qu.: 601
## Max. :1414593
## Cols n_binary n_character n_factor
## Min. : 1.00 Min. : 0.00 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 3.00 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 5.00 Median : 0.00 Median : 0.0000 Median : 0.000
## Mean : 13.02 Mean : 1.94 Mean : 0.3112 Mean : 1.291
## 3rd Qu.: 9.00 3rd Qu.: 2.00 3rd Qu.: 0.0000 3rd Qu.: 2.000
## Max. :6831.00 Max. :624.00 Max. :17.0000 Max. :64.000
## n_logical n_numeric CSV Doc
## Min. : 0.00000 Min. : 0.00 Length:1745 Length:1745
## 1st Qu.: 0.00000 1st Qu.: 2.00 Class :character Class :character
## Median : 0.00000 Median : 3.00 Mode :character Mode :character
## Mean : 0.03037 Mean : 11.34
## 3rd Qu.: 0.00000 3rd Qu.: 7.00
## Max. :11.00000 Max. :6830.00
# Keep the Rows and Cols columns of the full data (columns 4 and 5) so their
# summary can be compared with the filtered subset.
preVals <- data[, c("Rows", "Cols")]
summary(preVals)
## Rows Cols
## Min. : 2 Min. : 1.00
## 1st Qu.: 35 1st Qu.: 3.00
## Median : 108 Median : 5.00
## Mean : 3861 Mean : 13.02
## 3rd Qu.: 601 3rd Qu.: 9.00
## Max. :1414593 Max. :6831.00
# Restrict to the first nine columns, then keep the rows whose n_factor is 0.
subrow <- data[, seq_len(9)]
# which() drops rows where the comparison is NA, the same way subset() does.
new.Data <- subrow[which(subrow$n_factor == 0), ]
# Show the first five rows of the filtered data.
print(new.Data[seq_len(5), ])
## Package Item
## 2 AER ArgentinaCPI
## 4 AER BenderlyZwick
## 5 AER BondYield
## 8 AER ChinaIncome
## 9 AER CigarettesB
## Title Rows Cols
## 2 Consumer Price Index in Argentina 80 2
## 4 Benderly and Zwick Data: Inflation, Growth and Stock Returns 31 5
## 5 Bond Yield Data 60 2
## 8 Chinese Real National Income Data 37 5
## 9 Cigarette Consumption Data 46 3
## n_binary n_character n_factor n_logical
## 2 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
# Replace the nine column names with the short labels used from here on
# (n_factor becomes NF, n_logical becomes Lvl).
names(new.Data) <- c(
  "Corp", "Team", "Project", "R", "C", "NB", "NC", "NF", "Lvl"
)
# Print the first row to confirm the new header.
print(new.Data[1, ])
## Corp Team Project R C NB NC NF Lvl
## 2 AER ArgentinaCPI Consumer Price Index in Argentina 80 2 0 0 0 0
# Per-column summary statistics of the filtered, renamed data frame.
summary(new.Data)
## Corp Team Project R
## Length:969 Length:969 Length:969 Min. : 2
## Class :character Class :character Class :character 1st Qu.: 27
## Mode :character Mode :character Mode :character Median : 84
## Mean : 2867
## 3rd Qu.: 445
## Max. :348532
## C NB NC NF
## Min. : 1.00 Min. : 0.000 Min. : 0.0000 Min. :0
## 1st Qu.: 2.00 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:0
## Median : 4.00 Median : 0.000 Median : 0.0000 Median :0
## Mean : 15.91 Mean : 1.639 Mean : 0.5046 Mean :0
## 3rd Qu.: 8.00 3rd Qu.: 1.000 3rd Qu.: 0.0000 3rd Qu.:0
## Max. :6831.00 Max. :624.000 Max. :17.0000 Max. :0
## Lvl
## Min. : 0.00000
## 1st Qu.: 0.00000
## Median : 0.00000
## Mean : 0.03302
## 3rd Qu.: 0.00000
## Max. :11.00000
# Pull the row-count (R) and column-count (C) columns of the filtered data,
# i.e. columns 4 and 5, and summarise them.
newVals <- new.Data[, c("R", "C")]
summary(newVals)
## R C
## Min. : 2 Min. : 1.00
## 1st Qu.: 27 1st Qu.: 2.00
## Median : 84 Median : 4.00
## Mean : 2867 Mean : 15.91
## 3rd Qu.: 445 3rd Qu.: 8.00
## Max. :348532 Max. :6831.00
# Repeat the summary of the unfiltered Rows/Cols values for side-by-side
# comparison with the summary of newVals.
summary(preVals)
## Rows Cols
## Min. : 2 Min. : 1.00
## 1st Qu.: 35 1st Qu.: 3.00
## Median : 108 Median : 5.00
## Mean : 3861 Mean : 13.02
## 3rd Qu.: 601 3rd Qu.: 9.00
## Max. :1414593 Max. :6831.00
We can see that the median of the new row counts decreased by 24 (from 108 to 84) and the mean decreased by 994 (from 3861 to 2867). This decrease is due to the reduction in rows: the subset contains only 969 rows, compared with 1745 rows in the original data. The column results are mixed: the mean of the new column counts increased by 2.89 (from 13.02 to 15.91), while the median decreased by one (from 5 to 4). The increase in the column mean is due to subsetting on n_factor == 0, which eliminated a few low values.
# Recode the numeric codes in Lvl into descriptive labels using a single
# lookup table instead of one replacement statement per code.
# Codes with no entry in the table, and NA values, are left unchanged in
# their character form.
lvl_labels <- c(
  "0" = "New Partner",
  "1" = "NP Training",
  "2" = "NP Trial",
  "3" = "NP Test period",
  "7" = "Freshman",
  "11" = "Verified Member" # spelling fixed (was "Verified Memeber")
)
new.Data$Lvl <- as.character(new.Data$Lvl)
# match() returns NA for codes that are not in the table; only recode hits.
lvl_idx <- match(new.Data$Lvl, names(lvl_labels))
lvl_known <- !is.na(lvl_idx)
new.Data$Lvl[lvl_known] <- unname(lvl_labels[lvl_idx[lvl_known]])
# Keep the first row encountered for each distinct level label.
cLVL <- new.Data[!duplicated(new.Data$Lvl), ]
print(cLVL)
## Corp Team
## 2 AER ArgentinaCPI
## 237 causaldata credit_cards
## 244 causaldata Mroz
## 250 causaldata restaurant_inspections
## 1258 openintro ucla_f18
## 1694 tidyr billboard
## Project R C NB NC NF
## 2 Consumer Price Index in Argentina 80 2 0 0 0
## 237 Data on Taiwanese Credit Card Holders 30000 4 2 0 0
## 244 U.S. Women's Labor-Force Participation 753 8 3 0 0
## 250 Data on Restaurant Inspections 27178 5 1 1 0
## 1258 UCLA courses in Fall 2018 3950 14 7 5 0
## 1694 Song rankings for Billboard top 100 in the year 2000 317 79 10 2 0
## Lvl
## 2 New Partner
## 237 NP Trial
## 244 NP Test period
## 250 NP Training
## 1258 Freshman
## 1694 Verified Member
# Save the filtered, relabelled data to CSV in the working directory,
# without a row-name column.
write.csv(
  x = new.Data,
  file = "HW_TeamSelection.csv",
  row.names = FALSE
)