Membuat Vector
# Build a numeric vector, then add 1 to every element in one vectorised step.
x <- c(1, 2, 3, 4, 5)
x <- x + 1
x
## [1] 2 3 4 5 6
Membuat Vector String
# Parallel vectors: one height value and one name per athlete.
height <- c(75, 74, 67, 83, 75)
name <- c(
  "Alex Ovechkin",
  "Mike Trout",
  "Lionel Messi",
  "Giannis Antetokounmpo",
  "Patrick Mahomes"
)
Membuat Data Frame
# Combine the two vectors into a data frame with columns `name` and `height`.
atlet <- data.frame(name = name, height = height)
print(atlet)
## name height
## 1 Alex Ovechkin 75
## 2 Mike Trout 74
## 3 Lionel Messi 67
## 4 Giannis Antetokounmpo 83
## 5 Patrick Mahomes 75
Contoh Kasus Consider the following set of attributes about the American Film Institute’s top five movies ever from their 2007 list.
# Title, release year and running time for each of the five films.
Movie <- c(
  "Citizen Kane", "The Godfather", "Casablanca",
  "Raging Bull", "Singing in the Rain"
)
Year <- c(1941, 1972, 1942, 1980, 1952)
RunTime <- c(119, 177, 102, 129, 103)
# Running time divided by 60; note this vector is not added to MovieInfo.
RunTimeHours <- RunTime / 60
MovieInfo <- data.frame(Movie = Movie, Year = Year, RunTime = RunTime)
Consider the following set of attributes about a series of LucasArts—an early video game company under the umbrella of George Lucas’s Lucasfilm company—video games.
# Title, release year and rank for each of the four LucasArts games.
# Fixed data typo: "Day of the Tentacole" -> "Day of the Tentacle".
Title <- c(
  "The Secret of Monkey Island",
  "Indiana Jones, and the Fate of Atlantis",
  "Day of the Tentacle",
  "Grim Fandango"
)
Release <- c(1990, 1992, 1993, 1998)
# Difference between each release year and 1982 (vectorised subtraction).
Release - 1982
## [1] 8 10 11 16
Rank <- c(14, 11, 6, 1)
AdventureGames <- data.frame(Title, Release, Rank)
Random Numbers, and Selecting a Random Sample
Subsetting vector
# Example vector used to demonstrate the different ways of indexing.
hours <- c(8.84, 3.26, 2.81, 0.64, 0.60, 0.53, 0.37, 0.35, 0.31, 0.24)
# One element by position.
hours[1]
## [1] 8.84
# Several elements by position.
hours[c(1, 3, 9)]
## [1] 8.84 2.81 0.31
# Logical mask: values from 0.5 to 0.75 inclusive.
hours[hours >= 0.5 & hours <= 0.75]
## [1] 0.64 0.60 0.53
# Logical mask: values below 0.25 or above 4.
hours[hours < 0.25 | hours > 4]
## [1] 8.84 0.24
Subsetting data frames
# Three parallel vectors (activity name, average hours, category), one entry
# per activity, combined into the Activities data frame.
Name <- c(
  "Sleeping", "Working", "Watching Television", "Socializing",
  "Food Preparation", "Housework", "Childcare", "Consumer Goods Purchase",
  "Participating in Recreation", "Attending Class"
)
AverageHours <- c(8.84, 3.26, 2.81, 0.64, 0.60, 0.53, 0.37, 0.35, 0.31, 0.24)
Category <- c(
  "Personal Care", "Work-Related", "Leisure", "Leisure", "Household",
  "Household", "Caring for Household", "Purchasing", "Leisure", "Education"
)
Activities <- data.frame(
  Name = Name,
  AverageHours = AverageHours,
  Category = Category
)
Memanggil data
# Single cells, addressed by row number and column name.
Activities[5, "Category"]
## [1] "Household"
Activities[3, "Name"]
## [1] "Watching Television"
Activities[7, "AverageHours"]
## [1] 0.37
Memanggil data per baris
# Whole rows: an empty column index keeps every column.
Activities[5, ]
## Name AverageHours Category
## 5 Food Preparation 0.6 Household
Activities[c(1, 4, 7), ]
## Name AverageHours Category
## 1 Sleeping 8.84 Personal Care
## 4 Socializing 0.64 Leisure
## 7 Childcare 0.37 Caring for Household
Memanggil data per kolom
# Extract one column as a plain vector.
Activities[["Category"]]
## [1] "Personal Care" "Work-Related" "Leisure"
## [4] "Leisure" "Household" "Household"
## [7] "Caring for Household" "Purchasing" "Leisure"
## [10] "Education"
Menyeleksi data dengan operator logika
# Keep only the rows whose AverageHours is greater than 1.
# The trailing comma matters: with a single index (no comma) a data frame
# treats the logical vector as a COLUMN selector, so no rows are filtered.
Activities[Activities$AverageHours > 1, ]
## Name AverageHours Category
## 1 Sleeping 8.84 Personal Care
## 2 Working 3.26 Work-Related
## 3 Watching Television 2.81 Leisure
Random numbers in R
# Fix the RNG seed so the ten standard-normal draws below are reproducible.
set.seed(seed = 8)
rnorm(n = 10)
## [1] -0.08458607 0.84040013 -0.46348277 -0.55083500 0.73604043 -0.10788140
## [7] -0.17028915 -1.08833171 -3.01105168 -0.59317433
Select a random sample
sample.int(n = <number of subjects to sample from>, size = <number of subjects to select>)
Contoh Kasus Suppose we have the following data frame named Colleges:
# One row per college: employee count, top salary and median salary.
Colleges <- data.frame(
  College = c(
    "William and Mary", "Christopher Newport", "George Mason",
    "James Madison", "Longwood", "Norfolk State", "Old Dominion",
    "Radford", "Mary Washington", "Virginia", "Virginia Commonwealth",
    "Virginia Military Institute", "Virginia Tech", "Virginia State"
  ),
  Employees = c(
    2104, 922, 4043, 2833, 746, 919, 2369,
    1273, 721, 7431, 5825, 550, 7303, 761
  ),
  TopSalary = c(
    425000, 381486, 536714, 428400, 328268, 295000, 448272,
    312080, 449865, 561099, 503154, 364269, 500000, 356524
  ),
  MedianSalary = c(
    56496, 47895, 63029, 53080, 52000, 49605, 54416,
    51000, 53045, 60048, 55000, 44999, 51656, 55925
  )
)
# Top salary for the colleges in rows 1, 3, 10 and 12.
Colleges$TopSalary[c(1, 3, 10, 12)]
## [1] 425000 536714 561099 364269
# Median salary at colleges whose top salary exceeds 400000.
Colleges$MedianSalary[Colleges$TopSalary > 400000]
## [1] 56496 63029 53080 54416 53045 60048 55000 51656
# Every column for colleges with at most 1000 employees.
Colleges[Colleges$Employees <= 1000, ]
## College Employees TopSalary MedianSalary
## 2 Christopher Newport 922 381486 47895
## 5 Longwood 746 328268 52000
## 6 Norfolk State 919 295000 49605
## 9 Mary Washington 721 449865 53045
## 12 Virginia Military Institute 550 364269 44999
## 14 Virginia State 761 356524 55925
# The first five rows.
Colleges[1:5, ]
## College Employees TopSalary MedianSalary
## 1 William and Mary 2104 425000 56496
## 2 Christopher Newport 922 381486 47895
## 3 George Mason 4043 536714 63029
## 4 James Madison 2833 428400 53080
## 5 Longwood 746 328268 52000
Suppose we have the following data frame named Countries:
# One row per nation: region, population, percentage increase, GDP per capita.
Countries <- data.frame(
  Nation = c(
    "China", "India", "United States", "Indonesia", "Brazil",
    "Pakistan", "Nigeria", "Bangladesh", "Russia", "Mexico"
  ),
  Region = c(
    "Asia", "Asia", "North America", "Asia", "South America",
    "Asia", "Africa", "Asia", "Europe", "North America"
  ),
  Population = c(
    1409517397, 1339180127, 324459463, 263991379, 209288278,
    197015955, 190886311, 164669751, 143989754, 129163276
  ),
  PctIncrease = c(0.40, 1.10, 0.70, 1.10, 0.80, 2.00, 2.60, 1.10, 0.00, 1.30),
  GDPcapita = c(8582, 1852, 57467, 3895, 10309, 1629, 2640, 1524, 10248, 8562)
)
# Non-Asian nations with GDP per capita below 10000.
with(Countries, Countries[GDPcapita < 10000 & Region != "Asia", ])
## Nation Region Population PctIncrease GDPcapita
## 7 Nigeria Africa 190886311 2.6 2640
## 10 Mexico North America 129163276 1.3 8562
# Names of the first three nations.
head(Countries$Nation, 3)
## [1] "China" "India" "United States"
# Names of nations whose percentage increase is above 1.5.
with(Countries, Nation[PctIncrease > 1.5])
## [1] "Pakistan" "Nigeria"
Suppose we have the following data frame named Olympics:
# One row per Olympic Games: year, season, host, participation counts, and
# the nation recorded in the Leader column.
Olympics <- data.frame(
  Year = c(
    1992, 1992, 1994, 1996, 1998, 2000, 2002, 2004,
    2006, 2008, 2010, 2012, 2014, 2016, 2018
  ),
  Type = c(
    "Summer", "Winter", "Winter", "Summer", "Winter",
    "Summer", "Winter", "Summer", "Winter", "Summer",
    "Winter", "Summer", "Winter", "Summer", "Winter"
  ),
  Host = c(
    "Spain", "France", "Norway", "United States", "Japan",
    "Australia", "United States", "Greece", "Italy", "China",
    "Canada", "United Kingdom", "Russia", "Brazil", "South Korea"
  ),
  Competitors = c(
    9356, 1801, 1737, 10318, 2176, 10651, 2399, 10625,
    2508, 10942, 2566, 10768, 2873, 11238, 2922
  ),
  Events = c(
    257, 57, 61, 271, 68, 300, 78, 301,
    84, 302, 86, 302, 98, 306, 102
  ),
  Nations = c(
    169, 64, 67, 197, 72, 199, 78, 201,
    80, 204, 82, 204, 88, 207, 92
  ),
  Leader = c(
    "Unified Team", "Germany", "Russia", "United States", "Germany",
    "United States", "Norway", "United States", "Germany", "China",
    "Canada", "United States", "Russia", "United States", "Norway"
  )
)
# Games where the host nation is also the Leader.
with(Olympics, Olympics[Host == Leader, ])
## Year Type Host Competitors Events Nations Leader
## 4 1996 Summer United States 10318 271 197 United States
## 10 2008 Summer China 10942 302 204 China
## 11 2010 Winter Canada 2566 86 82 Canada
## 13 2014 Winter Russia 2873 98 88 Russia
# Games with more than 35 competitors per event.
with(Olympics, Olympics[Competitors / Events > 35, ])
## Year Type Host Competitors Events Nations Leader
## 1 1992 Summer Spain 9356 257 169 Unified Team
## 4 1996 Summer United States 10318 271 197 United States
## 6 2000 Summer Australia 10651 300 199 United States
## 8 2004 Summer Greece 10625 301 201 United States
## 10 2008 Summer China 10942 302 204 China
## 12 2012 Summer United Kingdom 10768 302 204 United States
## 14 2016 Summer Brazil 11238 306 207 United States
# Games in which at least 80 nations took part.
with(Olympics, Olympics[Nations >= 80, ])
## Year Type Host Competitors Events Nations Leader
## 1 1992 Summer Spain 9356 257 169 Unified Team
## 4 1996 Summer United States 10318 271 197 United States
## 6 2000 Summer Australia 10651 300 199 United States
## 8 2004 Summer Greece 10625 301 201 United States
## 9 2006 Winter Italy 2508 84 80 Germany
## 10 2008 Summer China 10942 302 204 China
## 11 2010 Winter Canada 2566 86 82 Canada
## 12 2012 Summer United Kingdom 10768 302 204 United States
## 13 2014 Winter Russia 2873 98 88 Russia
## 14 2016 Summer Brazil 11238 306 207 United States
## 15 2018 Winter South Korea 2922 102 92 Norway