Day 2 of the RI workshop, Summer 2024
FSU
group_by()
summarize()
library(tidyverse)
# Load the raw homework data
read_csv('class_toy_data.csv') -> data
# An easier way to accomplish what we need (kept for reference):
#read_csv('class_toy_data.csv', skip = 1) -> data
# Using rename() instead: first drop the unnecessary row and variables,
# then give the remaining columns descriptive names
data %>%
  slice(-1) %>%
  select(!c(x1, x11, x12, x13, x14)) %>%
  rename(age = x2,
         gender = x3,
         educ = x4,
         income = x5,
         app_rat = x6,
         pid = x7,
         opp_tex = x8,
         gun_opp = x9,
         abo_opp = x10) -> data
# Format the data properly: pull the number out of each numeric-looking column
data %>%
  mutate(across(c(age, income, app_rat, gun_opp, abo_opp),
                parse_number)) -> data
# Negative approval ratings are recoded to NA
data %>%
  mutate(app_rat = if_else(app_rat < 0, NA_real_, app_rat)) -> data
# Keep only Republicans
data %>%
  filter(pid == 'Republican') -> data
# Saving the data
#write_csv(data, 'cutler_hw1_data')
# note that this could all technically be done in one big chunk
# The chain starts from the raw file rather than from `data`: by this point
# `data` has already been renamed, parsed and filtered, so running
# slice()/select()/rename() on it again would fail on the missing x* columns.
read_csv('class_toy_data.csv') %>%
  # drop the unnecessary first row and the unused variables
  slice(-1) %>%
  select(-c(x1, x11, x12, x13, x14)) %>%
  rename('age' = x2,
         'gender' = x3,
         'educ' = x4,
         'income' = x5,
         'app_rat' = x6,
         'pid' = x7,
         'opp_tex' = x8,
         'gun_opp' = x9,
         'abo_opp' = x10) %>%
  # parse the numeric columns, then turn negative approval ratings into NA
  mutate(age = parse_number(age),
         income = parse_number(income),
         app_rat = parse_number(app_rat),
         gun_opp = parse_number(gun_opp),
         abo_opp = parse_number(abo_opp),
         app_rat = case_when(app_rat < 0 ~ NA_real_,
                             TRUE ~ app_rat)) %>%
  # keep only republicans
  filter(pid == 'Republican')
#alternative ways to accomplish getting the data in the right format:
read_csv('class_toy_data.csv', skip = 1)
#or if we hadn't done that
data %>%
  mutate(across(c(age, income, app_rat, gun_opp, abo_opp), parse_number))
#or we could even just select the columns by type
# parse_number() only accepts character input, so where(is.numeric) would
# either select nothing or fail; pick the character columns instead and
# leave out the ones that hold text rather than numbers
data %>%
  mutate(across(where(is.character) & !c(gender, educ, pid, opp_tex),
                parse_number))
What are the types of data we can encounter?
Show Answer
Numeric, string/character, factor, logical
What is an object in R and where are they shown?
Show Answer
Data that you have created and stored under a name; an object can be, for example, a vector or a data frame. Objects are shown in the global environment.
What is this %>% called?
Show Answer
A “Pipe”
What functions did we learn about last class to clean data?
Show Answer
mutate(), select(), slice(), filter(), parse_number()
What functions can be used to rename variables?
Show Answer
rename(), clean_names()
| Data Format | Function |
|---|---|
| .rds | readRDS() |
| .RData | load() |
| .txt | read.delim() |
| .dta | read_dta() |
| .sav | read_sav() |
read_dta() and read_sav() come from the haven package
mutate(), select(), and filter()
case_when() is very useful for shaping data when combined with mutate()
library(tidyverse)
# Toy data set: one row per politician, written out row by row
tribble(
  ~`Full Name`, ~`Political Affiliation`, ~`Represented State`,
  ~`Politician Age`, ~`Years Served`, ~`Votes Received`, ~`Legislation Passed`,
  "John Smith",      "Democratic",  "California", 45,  6, 24000, 12,
  "Jimmy Dean",      "Republican",  "Texas",      65, NA,    NA, 10,
  "Robert Williams", NA,            "New York",   60,  2, 15000, NA,
  "Emily Davis",     "Democratic",  NA,           41,  4, 20000,  6,
  "Michael Brown",   "Libertarian", "Florida",    20, 12, 32000, 15
) -> real_congress
real_congress# A tibble: 5 × 7
`Full Name` `Political Affiliation` `Represented State` `Politician Age`
<chr> <chr> <chr> <dbl>
1 John Smith Democratic California 45
2 Jimmy Dean Republican Texas 65
3 Robert Williams <NA> New York 60
4 Emily Davis Democratic <NA> 41
5 Michael Brown Libertarian Florida 20
# ℹ 3 more variables: `Years Served` <dbl>, `Votes Received` <dbl>,
# `Legislation Passed` <dbl>
# Clean the column names, drop politicians with no recorded party, bin age
# into three bands, and keep only the identifying columns (renamed on the way)
real_congress %>%
  janitor::clean_names() %>%
  drop_na(political_affiliation) %>%
  mutate(age_cat = case_when(politician_age < 30 ~ "<30",
                             politician_age < 60 ~ "30-60",
                             politician_age >= 60 ~ "60+")) %>%
  select(full_name,
         party = political_affiliation,
         state = represented_state,
         age_cat) -> real_congress_2
real_congress_2# A tibble: 4 × 4
full_name party state age_cat
<chr> <chr> <chr> <chr>
1 John Smith Democratic California 30-60
2 Jimmy Dean Republican Texas 60+
3 Emily Davis Democratic <NA> 30-60
4 Michael Brown Libertarian Florida <30
Now that we’ve covered the basics again, let’s go through a problem together. Download the Live Coding 1 data from the course materials page.
As a group, we are going to:
group_by() and summarize() to accomplish this
group_by works similarly to rowwise() from the homework, let’s start there
# Tidy the column names, drop rows with no recorded party, and shorten the
# three long variable names; the cleaned data replaces real_congress
real_congress %>%
  janitor::clean_names() %>%
  drop_na(political_affiliation) %>%
  rename(party = political_affiliation,
         state = represented_state,
         age = politician_age) -> real_congress
real_congress# A tibble: 4 × 7
full_name party state age years_served votes_received legislation_passed
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 John Smith Demo… Cali… 45 6 24000 12
2 Jimmy Dean Repu… Texas 65 NA NA 10
3 Emily Davis Demo… <NA> 41 4 20000 6
4 Michael Brown Libe… Flor… 20 12 32000 15
# A tibble: 3 × 2
party sample
<chr> <int>
1 Democratic 2
2 Libertarian 1
3 Republican 1
# Bin age into three bands, then average the legislation passed in each band
real_congress %>%
  mutate(age_cat = case_when(age < 30 ~ "<30",
                             age < 60 ~ "30-60",
                             age >= 60 ~ "60+")) %>%
  group_by(age_cat) %>%
  summarise(leg_pro = mean(legislation_passed))
# A tibble: 3 × 2
age_cat leg_pro
<chr> <dbl>
1 30-60 9
2 60+ 10
3 <30 15
Show Answer
Change the variable from character to factor
# Same summary as before, but age_cat is turned into a factor with explicit
# levels so the groups come out in age order instead of alphabetical order
real_congress %>%
  mutate(age_cat = case_when(age < 30 ~ "<30",
                             age < 60 ~ "30-60",
                             age >= 60 ~ "60+"),
         age_cat = factor(age_cat, levels = c('<30', '30-60', '60+'))) %>%
  group_by(age_cat) %>%
  summarise(leg_pro = mean(legislation_passed))
# A tibble: 3 × 2
age_cat leg_pro
<fct> <dbl>
1 <30 15
2 30-60 9
3 60+ 10
# This ordering fails: factor() is asked to convert age_cat before
# case_when() has created it, so mutate() stops with the error printed below
real_congress %>%
mutate(age_cat = factor(age_cat,
levels = c('<30', '30-60', '60+')),
age_cat = case_when(age < 30 ~ "<30",
(age >= 30 & age < 60) ~ "30-60",
age >= 60 ~ "60+")) %>%
group_by(age_cat) %>%
summarize(leg_pro = mean(legislation_passed))Error in `mutate()`:
ℹ In argument: `age_cat = factor(age_cat, levels = c("<30", "30-60",
"60+"))`.
Caused by error:
! object 'age_cat' not found
age_cat variable isn’t in our data until we make it
summary(), or names()
summary() on a full dataset will give you summary statistics for each variable in the data
 full_name party state age
Length:4 Length:4 Length:4 Min. :20.00
Class :character Class :character Class :character 1st Qu.:35.75
Mode :character Mode :character Mode :character Median :43.00
Mean :42.75
3rd Qu.:50.00
Max. :65.00
years_served votes_received legislation_passed
Min. : 4.000 Min. :20000 Min. : 6.00
1st Qu.: 5.000 1st Qu.:22000 1st Qu.: 9.00
Median : 6.000 Median :24000 Median :11.00
Mean : 7.333 Mean :25333 Mean :10.75
3rd Qu.: 9.000 3rd Qu.:28000 3rd Qu.:12.75
Max. :12.000 Max. :32000 Max. :15.00
NA's :1 NA's :1
names() function
[1] "full_name" "party" "state"
[4] "age" "years_served" "votes_received"
[7] "legislation_passed"
nrow() and ncol() functions
$ to pull a single variable from a dataset:
 Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 35.75 43.00 42.75 50.00 65.00
table() function
length() function:
bind_rows() and/or bind_cols() depending on our application (bind_rows() only works with dataframes in this context)
bind_rows() or bind_cols()
# A tibble: 12 × 3
x y z
<dbl> <dbl> <chr>
1 1 1 <NA>
2 2 1 <NA>
3 3 1 <NA>
4 4 1 <NA>
5 100 2 <NA>
6 100 2 <NA>
7 100 2 <NA>
8 2 2 <NA>
9 4 NA alpha
10 5 NA delta
11 6 NA bravo
12 7 NA echo