Binding Data Frames

This lesson is called Binding Data Frames, part of the Going Deeper with R course. This lesson is called Binding Data Frames, part of the Going Deeper with R course.

Transcript

Click on the transcript to go to that point in the video. Please note that transcripts are auto generated and may contain minor inaccuracies.

View code shown in video

# Load Packages -----------------------------------------------------------

library(tidyverse)
library(fs)
library(readxl)
library(janitor)

# Create Directories ------------------------------------------------------

dir_create("data-raw")

# Download Data -----------------------------------------------------------

# https://www.oregon.gov/ode/educator-resources/assessment/Pages/Assessment-Group-Reports.aspx

# download.file("https://www.oregon.gov/ode/educator-resources/assessment/Documents/TestResults2122/pagr_schools_math_tot_raceethnicity_2122.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_tot_raceethnicity_2122.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/Documents/TestResults2122/TestResults2019/pagr_schools_math_tot_raceethnicity_1819.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_tot_raceethnicity_1819.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/TestResults2018/pagr_schools_math_raceethnicity_1718.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_raceethnicity_1718.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/TestResults2017/pagr_schools_math_raceethnicity_1617.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_raceethnicity_1617.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/TestResults2016/pagr_schools_math_raceethnicity_1516.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_raceethnicity_1516.xlsx")


# Import Data -------------------------------------------------------------

math_scores_2021_2022 <-
  read_excel(path = "data-raw/pagr_schools_math_tot_raceethnicity_2122.xlsx") |> 
  clean_names()

math_scores_2018_2019 <-
  read_excel(path = "data-raw/pagr_schools_math_tot_raceethnicity_1819.xlsx") |> 
  clean_names()


# Tidy and Clean Data -----------------------------------------------------

third_grade_math_proficiency_2021_2022 <-
  math_scores_2021_2022 |> 
  filter(student_group == "Total Population (All Students)") |> 
  filter(grade_level == "Grade 3") |> 
  select(academic_year, school_id, contains("number_level")) |> 
  pivot_longer(cols = starts_with("number_level"),
               names_to = "proficiency_level",
               values_to = "number_of_students") |> 
  mutate(proficiency_level = case_when(
    proficiency_level == "number_level_4" ~ "4",
    proficiency_level == "number_level_3" ~ "3",
    proficiency_level == "number_level_2" ~ "2",
    proficiency_level == "number_level_1" ~ "1"
  )) |> 
  mutate(number_of_students = parse_number(number_of_students)) |> 
  group_by(school_id) |> 
  mutate(pct = number_of_students / sum(number_of_students, na.rm = TRUE)) |> 
  ungroup()

third_grade_math_proficiency_2018_2019 <-
  math_scores_2018_2019 |> 
  filter(student_group == "Total Population (All Students)") |> 
  filter(grade_level == "Grade 3") |> 
  select(academic_year, school_id, contains("number_level")) |> 
  pivot_longer(cols = starts_with("number_level"),
               names_to = "proficiency_level",
               values_to = "number_of_students") |> 
  mutate(proficiency_level = case_when(
    proficiency_level == "number_level_4" ~ "4",
    proficiency_level == "number_level_3" ~ "3",
    proficiency_level == "number_level_2" ~ "2",
    proficiency_level == "number_level_1" ~ "1"
  )) |> 
  mutate(number_of_students = parse_number(number_of_students)) |> 
  group_by(school_id) |> 
  mutate(pct = number_of_students / sum(number_of_students, na.rm = TRUE)) |> 
  ungroup()

bind_rows(third_grade_math_proficiency_2018_2019,
          third_grade_math_proficiency_2021_2022) |> 
  view()

third_grade_math_proficiency_2018_2019 |> 
  bind_rows(third_grade_math_proficiency_2021_2022)

Your Turn

Solution

# Load Packages -----------------------------------------------------------

library(tidyverse)
library(fs)
library(readxl)
library(janitor)

# Create Directories ------------------------------------------------------

dir_create("data-raw")

# Download Data -----------------------------------------------------------

# https://www.oregon.gov/ode/reports-and-data/students/Pages/Student-Enrollment-Reports.aspx

# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20222023.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20222023.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20212022.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20212022.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20202021.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20202021.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20192020.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20192020.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20182019.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20182019.xlsx")

# Import Data -------------------------------------------------------------

enrollment_2022_2023 <- read_excel(path = "data-raw/fallmembershipreport_20222023.xlsx",
                                   sheet = "School 2022-23") |> 
  clean_names()

enrollment_2021_2022 <- read_excel(path = "data-raw/fallmembershipreport_20212022.xlsx",
                                   sheet = "School 2021-22") |> 
  clean_names()

# Tidy and Clean Data -----------------------------------------------------

enrollment_by_race_ethnicity_2022_2023 <-
  enrollment_2022_2023 |> 
  select(district_institution_id, school_institution_id,
         x2022_23_american_indian_alaska_native:x2022_23_multi_racial) |> 
  select(-contains("percent")) |> 
  pivot_longer(cols = -c(district_institution_id, school_institution_id),
               names_to = "race_ethnicity",
               values_to = "number_of_students") |> 
  mutate(race_ethnicity = str_remove(race_ethnicity, pattern = "x2022_23_")) |> 
  mutate(race_ethnicity = case_when(
    race_ethnicity == "american_indian_alaska_native" ~ "American Indian Alaska Native",
    race_ethnicity == "asian" ~ "Asian",
    race_ethnicity == "black_african_american" ~ "Black/African American",
    race_ethnicity == "hispanic_latino" ~ "Hispanic/Latino",
    race_ethnicity == "multiracial" ~ "Multi-Racial",
    race_ethnicity == "native_hawaiian_pacific_islander" ~ "Native Hawaiian Pacific Islander",
    race_ethnicity == "white" ~ "White",
    race_ethnicity == "multi_racial" ~ "Multiracial"
  )) |> 
  mutate(number_of_students = parse_number(number_of_students)) |> 
  group_by(district_institution_id, race_ethnicity) |> 
  summarize(number_of_students = sum(number_of_students, na.rm = TRUE)) |> 
  ungroup() |> 
  group_by(district_institution_id) |> 
  mutate(pct = number_of_students / sum(number_of_students)) |> 
  ungroup() |> 
  mutate(year = "2022-2023")

enrollment_by_race_ethnicity_2021_2022 <-
  enrollment_2021_2022 |> 
  select(attending_district_institution_id, attending_school_institution_id,
         x2021_22_american_indian_alaska_native:x2021_22_multi_racial) |> 
  rename(district_institution_id = attending_district_institution_id,
         school_institution_id = attending_school_institution_id) |> 
  select(-contains("percent")) |> 
  pivot_longer(cols = -c(district_institution_id, school_institution_id),
               names_to = "race_ethnicity",
               values_to = "number_of_students") |> 
  mutate(race_ethnicity = str_remove(race_ethnicity, pattern = "x2021_22_")) |> 
  mutate(race_ethnicity = case_when(
    race_ethnicity == "american_indian_alaska_native" ~ "American Indian Alaska Native",
    race_ethnicity == "asian" ~ "Asian",
    race_ethnicity == "black_african_american" ~ "Black/African American",
    race_ethnicity == "hispanic_latino" ~ "Hispanic/Latino",
    race_ethnicity == "multiracial" ~ "Multi-Racial",
    race_ethnicity == "native_hawaiian_pacific_islander" ~ "Native Hawaiian Pacific Islander",
    race_ethnicity == "white" ~ "White",
    race_ethnicity == "multi_racial" ~ "Multiracial"
  )) |> 
  mutate(number_of_students = parse_number(number_of_students)) |> 
  group_by(district_institution_id, race_ethnicity) |> 
  summarize(number_of_students = sum(number_of_students, na.rm = TRUE)) |> 
  ungroup() |> 
  group_by(district_institution_id) |> 
  mutate(pct = number_of_students / sum(number_of_students)) |> 
  ungroup() |> 
  mutate(year = "2021-2022")

bind_rows(enrollment_by_race_ethnicity_2021_2022,
          enrollment_by_race_ethnicity_2022_2023)

Import 2021-2022 enrollment data into a data frame called enrollment_by_race_ethnicity_2021_2022 and clean it using the code you used for the 2022-2023 data. You’ll need to change bits of your code to make it work.
Add a year variable to the enrollment_by_race_ethnicity_2021_2022 and enrollment_by_race_ethnicity_2022_2023 data frames.
Use bind_rows() to make a enrollment_by_race_ethnicity data frame by combining the enrollment_by_race_ethnicity_2021_2022 and enrollment_by_race_ethnicity_2022_2023 data frames.

Here is code to get started:

# Load Packages -----------------------------------------------------------

library(tidyverse)
library(fs)
library(readxl)
library(janitor)

# Create Directories ------------------------------------------------------

dir_create("data-raw")

# Download Data -----------------------------------------------------------

# https://www.oregon.gov/ode/reports-and-data/students/Pages/Student-Enrollment-Reports.aspx

# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20222023.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20222023.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20212022.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20212022.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20202021.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20202021.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20192020.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20192020.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20182019.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20182019.xlsx")

# Import Data -------------------------------------------------------------

enrollment_2022_2023 <- read_excel(path = "data-raw/fallmembershipreport_20222023.xlsx",
                                   sheet = "School 2022-23") |> 
  clean_names()

enrollment_2021_2022 <- read_excel(path = "data-raw/fallmembershipreport_20212022.xlsx",
                                   sheet = "School 2021-22") |> 
  clean_names()

# Tidy and Clean Data -----------------------------------------------------

enrollment_by_race_ethnicity_2022_2023 <-
  enrollment_2022_2023 |> 
  select(district_institution_id, school_institution_id,
         x2022_23_american_indian_alaska_native:x2022_23_multi_racial) |> 
  select(-contains("percent")) |> 
  pivot_longer(cols = -c(district_institution_id, school_institution_id),
               names_to = "race_ethnicity",
               values_to = "number_of_students") |> 
  mutate(race_ethnicity = str_remove(race_ethnicity, pattern = "x2022_23_")) |> 
  mutate(race_ethnicity = case_when(
    race_ethnicity == "american_indian_alaska_native" ~ "American Indian Alaska Native",
    race_ethnicity == "asian" ~ "Asian",
    race_ethnicity == "black_african_american" ~ "Black/African American",
    race_ethnicity == "hispanic_latino" ~ "Hispanic/Latino",
    race_ethnicity == "multiracial" ~ "Multi-Racial",
    race_ethnicity == "native_hawaiian_pacific_islander" ~ "Native Hawaiian Pacific Islander",
    race_ethnicity == "white" ~ "White",
    race_ethnicity == "multi_racial" ~ "Multiracial"
  )) |> 
  mutate(number_of_students = parse_number(number_of_students)) |> 
  group_by(district_institution_id, race_ethnicity) |> 
  summarize(number_of_students = sum(number_of_students, na.rm = TRUE)) |> 
  ungroup() |> 
  group_by(district_institution_id) |> 
  mutate(pct = number_of_students / sum(number_of_students)) |> 
  ungroup() |> 
  mutate(year = YOURCODEHERE)

enrollment_by_race_ethnicity_2021_2022 <-
  enrollment_2021_2022 |> 
  select(attending_district_institution_id, attending_school_institution_id,
         YOURCODEHERE:YOURCODEHERE) |> 
  rename(YOURCODEHERE = attending_district_institution_id,
         YOURCODEHERE = attending_school_institution_id) |> 
  select(-contains("percent")) |> 
  pivot_longer(cols = -c(district_institution_id, school_institution_id),
               names_to = "race_ethnicity",
               values_to = "number_of_students") |> 
  mutate(race_ethnicity = str_remove(race_ethnicity, pattern = YOURCODEHERE)) |> 
  mutate(race_ethnicity = case_when(
    race_ethnicity == "american_indian_alaska_native" ~ "American Indian Alaska Native",
    race_ethnicity == "asian" ~ "Asian",
    race_ethnicity == "black_african_american" ~ "Black/African American",
    race_ethnicity == "hispanic_latino" ~ "Hispanic/Latino",
    race_ethnicity == "multiracial" ~ "Multi-Racial",
    race_ethnicity == "native_hawaiian_pacific_islander" ~ "Native Hawaiian Pacific Islander",
    race_ethnicity == "white" ~ "White",
    race_ethnicity == "multi_racial" ~ "Multiracial"
  )) |> 
  mutate(number_of_students = parse_number(number_of_students)) |> 
  group_by(district_institution_id, race_ethnicity) |> 
  summarize(number_of_students = sum(number_of_students, na.rm = TRUE)) |> 
  ungroup() |> 
  group_by(district_institution_id) |> 
  mutate(pct = number_of_students / sum(number_of_students)) |> 
  ungroup() |> 
  mutate(year = YOURCODEHERE)

enrollment_by_race_ethnicity <-
  bind_rows(YOURCODEHERE,
            YOURCODEHERE)

Have any questions? Put them below and we will help you out!

You need to be signed-in to comment on this post. Login.

Olivia Noel • November 13, 2023

Hi David! I've noted that data frames need to have identical variable names in order to use bind_rows().. But do the variables also need to be in the same order? And do the variable types need to be the same -- or is it possible to force a bind that would allow the first data frame called to determine var type?

Libby Heeren Coach • November 15, 2023

Hey, Olivia! For bind_rows(), the columns can be in any order you like, as long as the column names match, but the data types must be the same. For this function, you can also have a different number of columns in each data frame - it will fill the missing data with NA values for you.