# Group-wise summaries of the built-in mtcars data set.
# Columns are referenced explicitly as mtcars$<name> rather than through
# attach()/detach(): nothing is added to the search path, no global variable
# can mask a column, and an error half-way through cannot leave mtcars attached.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
#- average horsepower grouped by the number of cylinders
# The unnamed list() keeps the default result column names (Group.1, x).
dataAggregation1 <- aggregate(mtcars$hp, by = list(mtcars$cyl), FUN = mean, na.rm = TRUE)
dataAggregation1
## Group.1 x
## 1 4 82.64
## 2 6 122.29
## 3 8 209.21
#- maximum weight per number of carburetors
dataAggregation2 <- aggregate(mtcars$wt, by = list(mtcars$carb), FUN = max, na.rm = TRUE)
dataAggregation2
## Group.1 x
## 1 1 3.460
## 2 2 3.845
## 3 3 4.070
## 4 4 5.424
## 5 6 2.770
## 6 8 3.570
# Read the remote comma-separated file tc-50-campus.csv into a data frame and
# inspect its structure, last rows and per-column summary.
file <- "http://data.upf.edu/storage/f/2013-06-13T13%3A56%3A38.399Z/tc-50-campus.csv"
# quote = "" disables quote handling, so the apostrophes inside `descripcio`
# are read as ordinary characters.
# stringsAsFactors = TRUE is spelled out because read.table() only converts
# text columns to factors by default before R 4.0; the factor-based output
# echoed below (levels in str(), per-level counts in summary()) needs it.
data <- read.table(file, header = TRUE, sep = ",", quote = "", stringsAsFactors = TRUE)
str(data)
## 'data.frame': 120 obs. of 4 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ codi : int 401 402 403 404 405 406 407 408 409 410 ...
## $ universitat: Factor w/ 12 levels "UAB","UAO","UB",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ descripcio : Factor w/ 114 levels "Alimentación de Torribera",..: 91 90 86 95 98 1 41 76 25 10 ...
tail(data)
## id codi universitat
## 115 115 6002 Uvic
## 116 116 6003 Uvic
## 117 117 6004 Uvic
## 118 118 6201 UIC
## 119 119 6202 UIC
## 120 120 7001 UAO
## descripcio
## 115 Miramarges
## 116 Campus del centro 'BAU. Escuela Superior de Diseño'
## 117 Campus del centro 'EADA. Escuela de Alta Dirección y Administración'
## 118 Barcelona
## 119 Sant Cugat del Vallès
## 120 Bellesguard
summary(data)
## id codi universitat
## Min. : 1.0 Min. : 401 UAB :34
## 1st Qu.: 30.8 1st Qu.:2218 UPC :15
## Median : 60.5 Median :2414 URV :14
## Mean : 60.5 Mean :3125 UB :13
## 3rd Qu.: 90.2 3rd Qu.:4212 UdG :11
## Max. :120.0 Max. :7001 UPF :11
## (Other):22
## descripcio
## Barcelona : 3
## Campus del centro 'Barcelona Graduate School of Economics' : 2
## Campus del centro 'Instituto Nacional de Educación Física de Cataluña': 2
## Sabadell : 2
## Sant Cugat del Vallès : 2
## Alimentación de Torribera : 1
## (Other) :108
# Metadata for all the columns of the fixed-width file imported below
# NOTE(review): setwd() ties the script to one user's absolute path; the
# relative file names read later in this script are resolved against it.
setwd("/Users/lauracozma/Desktop/Brushups/R")
# Field widths, in characters, listed in the order the fields appear in each
# record (passed as `widths` to read.fwf below).
RRHH09Widths <- c(6, 2, 4, 2, 4, 1, 4, 4, 4, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 3, 1, 1, 1, 4, 2, 2, 2, 4, 4, 1, 1, 2, 2, 1, 2, 1, 1, 2, 4, 2, 4, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 11)
# Column names, matched by position to the widths above (passed as
# `col.names` to read.fwf below).
RRHH09Names <- c('MUIDENT', 'CCAARESI', 'ANONAC', 'CCAANAC', 'CONTNACIM', 'RELA', 'CONTNAC1', 'CONTNAC2', 'CONTNAC3', 'SEXO', 'ESTADOCIVIL', 'DEPEN5', 'DEPEN18', 'DEPENMAS', 'NIVESTPA', 'NIVESTMA', 'NIVPROFPA', 'NIVPROFMA', 'ACTIPRIPA1', 'ACTIPRIMA1', 'EDUPRIM', 'EDUSECUN', 'BACHILLER', 'RES', 'CARES1', 'MESRES1', 'CARES2', 'MESRES2', 'CARES3', 'MESRES3', 'DOCTOR', 'CODIDOC', 'INFUN', 'INAPL', 'DESTEC', 'ANODOC', 'MESDOC', 'DURDOCANO', 'DURDOCMES', 'CONTEST', 'ANOEST', 'FINADOC', 'TRABDOC', 'ANOSINTRA', 'MESSINTRA', 'SITLAB', 'HTRAB', 'INGRESOS', 'AUTON', 'CNAE2', 'ANOINIALP', 'CAEMP', 'CONTEMP', 'SECTOR', 'CODALP', 'CATEGPROF', 'HLECT', 'TIPOCONT', 'JORLAB', 'BUSCATRAB', 'NIVELMIN', 'NIVELEST', 'RELTRABDOC', 'NIVELSAT1', 'NIVELSAT2', 'NIVELSAT3', 'NIVELSAT4', 'NIVELSAT5', 'NIVELSAT6', 'NIVELSAT7', 'NIVELSAT8', 'NIVELSAT9', 'NIVELSAT10', 'NIVELSAT11', 'NIVELSAT12', 'NIVELSAT13', 'ACTIANT', 'SECTORANT', 'ACTIPRIANT', 'ACTIANTANO', 'ACTIANTMES', 'TRABPOS', 'MOTIVOPOS1', 'MOTIVOPOS2', 'MOTIVOPOS3', 'MOTIVOPOS4', 'MOTIVOPOS5', 'MOTIVOPOS6', 'MOTIVOPOS7', 'MOTIVOPOS8', 'PORCINVES', 'PORCDOC', 'PORCOTRAS', 'FINANPOS', 'GESBUSEMP', 'PASOSEMP1', 'PASOSEMP2', 'PASOSEMP3', 'PASOSEMP4', 'PASOSEMP5', 'PASOSEMP6', 'PASOSEMP7', 'PASOSEMP8', 'PASOSEMP9', 'PASOSEMP10', 'PASOSEMP11', 'NOPASOSEMP1', 'NOPASOSEMP2', 'NOPASOSEMP3', 'NOPASOSEMP4', 'NOPASOSEMP5', 'NOPASOSEMP6', 'NOPASOSEMP7', 'NOPASOSEMP8', 'NOPASOSEMP9', 'CONT1', 'ANOIPAIS1', 'ANOFPAIS1', 'MESES1', 'CONT2', 'ANOIPAIS2', 'ANOFPAIS2', 'MESES2', 'CONT3', 'ANOIPAIS3', 'ANOFPAIS3', 'MESES3', 'CONT4', 'ANOIPAIS4', 'ANOFPAIS4', 'MESES4', 'CONT5', 'ANOIPAIS5', 'ANOFPAIS5', 'MESES5', 'CONT6', 'ANOIPAIS6', 'ANOFPAIS6', 'MESES6', 'CONT7', 'ANOIPAIS7', 'ANOFPAIS7', 'MESES7', 'MOTFUERA1', 'MOTFUERA2', 'MOTFUERA3', 'MOTFUERA4', 'MOTFUERA5', 'MOTFUERA6', 'MOTFUERA7', 'MOTVENIR1', 'MOTVENIR2', 'MOTVENIR3', 'MOTVENIR4', 'MOTVENIR5', 'MOTVENIR6', 'MOTVENIR7', 'PREVIRSE', 'MESANTESIR', 'MOTIRSE1', 'MOTIRSE2', 'MOTIRSE3', 'MOTIRSE4', 
'MOTIRSE5', 'MOTIRSE6', 'CONTOUT', 'INV', 'MOTNOINV1', 'MOTNOINV2', 'MOTNOINV3', 'MOTNOINV4', 'MOTNOINV5', 'MOTNOINV6', 'MOTNOINV7', 'MOTNOINV8', 'MOTNOINV9', 'ALGINV', 'MOTINV1', 'MOTINV2', 'MOTINV3', 'MOTINV4', 'MOTINV5', 'MOTINV6', 'MOTINV7', 'MOTINV8', 'MOTINV9', 'MESESINV', 'ARTIC', 'LIBROS', 'NUMPAT', 'PATNAC', 'PATEU', 'PATCOOP', 'PATCOM', 'COMPA', 'TUTOR', 'COOPERA', 'INVFUT', 'FACTOR')
# Fixed-width import: read at most the first 1000 records of RRHH09.txt,
# splitting each record by RRHH09Widths and labelling the resulting columns
# with RRHH09Names, then summarise the SITLAB column.
file <- "RRHH09.txt"
fwfDataFrame <- read.fwf(
  file = file,
  widths = RRHH09Widths,
  col.names = RRHH09Names,
  n = 1000
)
summary(fwfDataFrame[["SITLAB"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 1.07 1.00 3.00
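SITLAB is a variable that takes values between 1 and 3. Below, 3 is recoded to NA and 2 to 0, so that the mean of the recoded variable is the share of respondents with value 1: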
# Recode SITLAB in place: 3 becomes NA, 2 becomes 0, 1 is left unchanged.
# %in% is used instead of == because it never yields NA, so the second
# recode is not indexed by the NAs the first one has just introduced.
fwfDataFrame$SITLAB[fwfDataFrame$SITLAB %in% 3] <- NA
fwfDataFrame$SITLAB[fwfDataFrame$SITLAB %in% 2] <- 0
# Integer copy of the recoded column; its mean is the share of 1s among the
# non-missing answers.
sitlab <- as.integer(fwfDataFrame$SITLAB)
summary(sitlab)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 1.000 0.983 1.000 1.000 25
As we can see above, the employment rate among respondents is high, very close to 1 (mean = 0.98).
# Import the semicolon-separated census table and replace its header with
# English column names (assigned by position).
setwd("/Users/lauracozma/Desktop/Brushups/R")
censusBCN <- read.table(
  "MAP_SCENSAL.csv",
  header = TRUE,
  sep = ";"
)
names(censusBCN) <- c(
  "Date", "Men", "CensusDivision", "Women",
  "AGE_0_14", "AGE_15_A_24", "AGE_25_A_64", "AGE_65_plus",
  "NATIONALS", "EUCommunity", "Overseas"
)
str(censusBCN)
## 'data.frame': 1061 obs. of 11 variables:
## $ Date : Factor w/ 1 level "21/10/13": 1 1 1 1 1 1 1 1 1 1 ...
## $ Men : int 780 898 1855 1676 1508 1014 1260 2274 1250 1455 ...
## $ CensusDivision: int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Women : int 585 730 1549 1262 1096 771 810 1429 1039 1015 ...
## $ AGE_0_14 : int 139 179 599 363 294 159 223 484 268 331 ...
## $ AGE_15_A_24 : int 113 196 352 278 198 153 199 422 251 292 ...
## $ AGE_25_A_64 : int 885 1016 1969 1948 1708 1247 1478 2370 1449 1584 ...
## $ AGE_65_plus : int 228 237 484 349 404 226 170 427 321 263 ...
## $ NATIONALS : int 726 734 1811 1640 1356 791 922 1498 1151 971 ...
## $ EUCommunity : int 145 182 207 245 293 223 218 215 168 158 ...
## $ Overseas : int 494 712 1386 1053 955 771 930 1990 970 1341 ...
# New variable: estimated number of women aged 25-64 per row, obtained by
# applying the row's overall share of women to its 25-64 population.
censusBCN$percentWomen <- with(censusBCN, Women / (Men + Women))
censusBCN$WomenAge_25_64 <- with(censusBCN, percentWomen * AGE_25_A_64)
summary(censusBCN[["WomenAge_25_64"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 124 378 442 461 516 2090
1)
# 1. Stripchart, histogram and kernel density estimates of WomenAge_25_64
stripchart(censusBCN$WomenAge_25_64,
           method = "stack",
           pch = 3,
           main = "Women aged 25-64",
           col = "Red",
           offset = 0.5,
           xlab = "Women")
hist(censusBCN$WomenAge_25_64,
     col = rainbow(20),
     main = "Histogram: Number of Women between 25-64 years old",
     xlab = "Women")
# Kernel density estimate with the default bandwidth. The result is stored
# under the name `density` because the layout section later in this script
# plots that variable again; calls to density() still reach the function,
# since R skips non-function bindings when resolving a call.
density <- density(censusBCN$WomenAge_25_64)
plot(density, main = "")
# histogram on the density scale with a fitted density curve; the title
# reports the bandwidth actually passed to density() below (bw = 100)
hist(censusBCN$WomenAge_25_64,
     probability = TRUE,
     main = "Histogram: Number of Women between 25-64 years old, bw=100")
lines(density(censusBCN$WomenAge_25_64, bw = 100),
      col = "red",
      lwd = 3)
2)
# 2. scatterplot of two variables with a linear regression line
plot(x = censusBCN$AGE_25_A_64, y = censusBCN$Women,
     ylab = "Women", xlab = "Age 25-64", pch = 1, cex = 0.1)
# Linear regression line
abline(lm(Women ~ AGE_25_A_64, data = censusBCN), col = "red")
# Loess regression: fit the model once and reuse it
lw1 <- loess(Women ~ AGE_25_A_64, data = censusBCN)
# Fresh scatterplot of the raw data. Plotting the columns directly gives the
# same points as passing the fitted loess object to plot(), without fitting
# the model a second time or relying on the object's internal $x / $y.
plot(x = censusBCN$AGE_25_A_64, y = censusBCN$Women,
     pch = 1, cex = 0.1, ylab = "Women", xlab = "Age 25-64")
# Sort by the predictor so the fitted curve is drawn left to right
j <- order(censusBCN$AGE_25_A_64)
lines(censusBCN$AGE_25_A_64[j], lw1$fitted[j], col = "blue", lwd = 2)
3)
# 3. use the layout function to display all the univariate plots in a single matrix of plots
# Panel 1 spans the whole top row; panels 2 and 3 share the taller bottom row.
layout(matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE),
       widths = c(2, 2), heights = c(1, 2))
stripchart(censusBCN$WomenAge_25_64,
           method = "stack",
           pch = 3,
           main = "Women aged 25-64",
           col = "Red",
           offset = 0.5,
           xlab = "Women")
hist(censusBCN$WomenAge_25_64,
     col = rainbow(20),
     main = "",
     xlab = "Women")
# `density` is the kernel density estimate computed earlier in this script
plot(density, main = "")
# Return to a single-panel layout so later plots are not squeezed into the
# three-panel grid defined above.
layout(matrix(1))
# Import the aging-population table. stringsAsFactors = FALSE keeps the text
# columns as character so they can be cleaned with gsub() below.
agingPopulation <- read.csv("aging-population-2008.csv", header = TRUE, sep = ",", quote = "\"", stringsAsFactors = FALSE)
str(agingPopulation)
## 'data.frame': 333 obs. of 7 variables:
## $ SOA.Code : chr "E01031175" "E01031204" "E01031104" "E01031135" ...
## $ SOA.Name : chr "Overslade South East" "Wootton Wawen" "St Nicolas North & College" "Bilton South Cock Robin" ...
## $ Ward.Name : chr "Overslade" "Henley" "St. Nicolas" "Bilton" ...
## $ Locality : chr "Rugby Town West" "Alcester, Studley & Henley" "Weddington & St. Nicolas" "Rugby Town West " ...
## $ District.Borough : chr "Rugby" "Stratford-on-Avon" "Nuneaton & Bedworth" "Rugby" ...
## $ Total.Population : chr "1,335" "1,286" "1,453" "1,588" ...
## $ X..of.Total.Population.aged.60...Females...65...Males.: chr "41.7%" "41.1%" "40.0%" "39.1%" ...
# Total Population: strip the thousands separator and convert to numeric.
# NOTE(review): the pattern removes every punctuation character, so it would
# also drop a decimal point; that is harmless for the whole-number counts
# shown above.
agingPopulation$TotalPopulation <- as.numeric(gsub('![[:alnum:]]*[[:space:]]|[[:punct:]]', '', agingPopulation$Total.Population))
# % of Total Population aged 60+ (Females), 65+ (Males)
# Remove the "%" sign and convert to numeric. The column is passed to gsub()
# directly: wrapping it in cat() only prints the values and returns NULL,
# which made the assignment fail with "replacement has 0 rows".
agingPopulation$PercentTotal.Population.aged.60Females..65Males <- as.numeric(gsub("%", "", agingPopulation$X..of.Total.Population.aged.60...Females...65...Males., fixed = TRUE))
str(agingPopulation)
## 'data.frame': 333 obs. of 9 variables:
## $ SOA.Code : chr "E01031175" "E01031204" "E01031104" "E01031135" ...
## $ SOA.Name : chr "Overslade South East" "Wootton Wawen" "St Nicolas North & College" "Bilton South Cock Robin" ...
## $ Ward.Name : chr "Overslade" "Henley" "St. Nicolas" "Bilton" ...
## $ Locality : chr "Rugby Town West" "Alcester, Studley & Henley" "Weddington & St. Nicolas" "Rugby Town West " ...
## $ District.Borough : chr "Rugby" "Stratford-on-Avon" "Nuneaton & Bedworth" "Rugby" ...
## $ Total.Population : chr "1,335" "1,286" "1,453" "1,588" ...
## $ X..of.Total.Population.aged.60...Females...65...Males.: chr "41.7%" "41.1%" "40.0%" "39.1%" ...
## $ TotalPopulation : num 1335 1286 1453 1588 1487 ...
## $ PercentTotal.Population.aged.60Females..65Males : num 41.7 41.1 40 39.1 38.7 37.2 36.3 36.2 35.6 34.9 ...
First, when I imported the CSV file, R read the string variables as factors. To avoid this, I added the `stringsAsFactors=FALSE` argument to the `read.csv` call.
# Re-import the census table, expand the two-digit year in Date to four
# digits, and add a column holding the following calendar day.
censusBCN <- read.table("MAP_SCENSAL.csv", header = TRUE, sep = ";")
names(censusBCN) <- c("Date", "Men", "CensusDivision", "Women", "AGE_0_14", "AGE_15_A_24", "AGE_25_A_64", "AGE_65_plus", "NATIONALS", "EUCommunity", "Overseas")
# `format` is spelled out in full rather than abbreviated to `f`, so the
# calls do not depend on partial argument matching.
# Parse "dd/mm/yy" and write it back as a "dd/mm/YYYY" character string.
censusBCN$Date <- strftime(strptime(censusBCN$Date, format = "%d/%m/%y"), format = "%d/%m/%Y")
# Adding 1 to a Date moves it forward by one day.
censusBCN$TheDayAfter <- as.Date(censusBCN$Date, format = "%d/%m/%Y") + 1
str(censusBCN)
## 'data.frame': 1061 obs. of 12 variables:
## $ Date : chr "21/10/2013" "21/10/2013" "21/10/2013" "21/10/2013" ...
## $ Men : int 780 898 1855 1676 1508 1014 1260 2274 1250 1455 ...
## $ CensusDivision: int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Women : int 585 730 1549 1262 1096 771 810 1429 1039 1015 ...
## $ AGE_0_14 : int 139 179 599 363 294 159 223 484 268 331 ...
## $ AGE_15_A_24 : int 113 196 352 278 198 153 199 422 251 292 ...
## $ AGE_25_A_64 : int 885 1016 1969 1948 1708 1247 1478 2370 1449 1584 ...
## $ AGE_65_plus : int 228 237 484 349 404 226 170 427 321 263 ...
## $ NATIONALS : int 726 734 1811 1640 1356 791 922 1498 1151 971 ...
## $ EUCommunity : int 145 182 207 245 293 223 218 215 168 158 ...
## $ Overseas : int 494 712 1386 1053 955 771 930 1990 970 1341 ...
## $ TheDayAfter : Date, format: "2013-10-22" "2013-10-22" ...