Tidy Data Rule #3: Every Cell is a Single Value

This lesson is locked

Get access to all lessons in this course.

This lesson is called Tidy Data Rule #3: Every Cell is a Single Value, part of the Going Deeper with R course.

If the video is not playing correctly, you can watch it in a new window.

Transcript

Click on the transcript to go to that point in the video. Please note that transcripts are auto-generated and may contain minor inaccuracies.

View code shown in video

# Load Packages -----------------------------------------------------------

library(tidyverse)
library(janitor)
library(readxl)
library(fs)

# Create Directory --------------------------------------------------------

# Folder for the raw Excel files; the import step below reads from "data-raw/"
dir_create("data-raw")

# Download Data -----------------------------------------------------------

# download.file(url = "https://www.oregon.gov/ode/educator-resources/assessment/Documents/TestResults2122/pagr_schools_math_tot_raceethnicity_2122.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_tot_raceethnicity_2122.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/Documents/TestResults2122/TestResults2019/pagr_schools_math_tot_raceethnicity_1819.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_tot_raceethnicity_1819.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/TestResults2018/pagr_schools_math_raceethnicity_1718.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_raceethnicity_1718.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/TestResults2017/pagr_schools_math_raceethnicity_1617.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_raceethnicity_1617.xlsx")
# 
# download.file("https://www.oregon.gov/ode/educator-resources/assessment/TestResults2016/pagr_schools_math_raceethnicity_1516.xlsx",
#               mode = "wb",
#               destfile = "data-raw/pagr_schools_math_raceethnicity_1516.xlsx")

# Import Data -------------------------------------------------------------

# Read the 2021-22 math results workbook, then standardise its column names
# with janitor::clean_names() so they can be referenced as plain identifiers
math_scores_2021_2022 <- clean_names(
  read_excel("data-raw/pagr_schools_math_tot_raceethnicity_2122.xlsx")
)

# Keep only the all-students, Grade 3 rows, then reshape the number_level_*
# count columns into long format: the former column name goes into
# proficiency_level and its count into number_of_students
third_grade_math_proficiency_2021_2022 <-
  math_scores_2021_2022 |>
  filter(
    student_group == "Total Population (All Students)",
    grade_level == "Grade 3"
  ) |>
  select(academic_year, school_id, contains("number_level_")) |>
  pivot_longer(
    cols = contains("number_level_"),
    names_to = "proficiency_level",
    values_to = "number_of_students"
  )

# Strip the "number_level_" prefix with str_remove(); the remaining level
# digit stays a character value
mutate(
  third_grade_math_proficiency_2021_2022,
  proficiency_level = str_remove(proficiency_level, pattern = "number_level_")
)

# Translate each former column name to its level with recode();
# pairs are written as old_value = "new value"
mutate(
  third_grade_math_proficiency_2021_2022,
  proficiency_level = recode(
    proficiency_level,
    number_level_4 = "4",
    number_level_3 = "3",
    number_level_2 = "2",
    number_level_1 = "1"
  )
)

# Recode one level at a time with if_else(): each step replaces a single
# matching value and passes every other value through unchanged.
# Expressions inside one mutate() are evaluated in order, so each step works
# on the result of the step before it.
third_grade_math_proficiency_2021_2022 |>
  mutate(
    proficiency_level = if_else(
      proficiency_level == "number_level_4", "4", proficiency_level
    ),
    proficiency_level = if_else(
      proficiency_level == "number_level_3", "3", proficiency_level
    ),
    proficiency_level = if_else(
      proficiency_level == "number_level_2", "2", proficiency_level
    ),
    proficiency_level = if_else(
      proficiency_level == "number_level_1", "1", proficiency_level
    )
  )

# Recode all four levels in a single case_when(); the conditions are mutually
# exclusive, so their order does not matter. A value matching none of them
# would become NA because no fallback branch is given.
mutate(
  third_grade_math_proficiency_2021_2022,
  proficiency_level = case_when(
    proficiency_level == "number_level_1" ~ "1",
    proficiency_level == "number_level_2" ~ "2",
    proficiency_level == "number_level_3" ~ "3",
    proficiency_level == "number_level_4" ~ "4"
  )
)

# parse_number() extracts the number embedded in each value, so
# proficiency_level ends up numeric rather than character
mutate(
  third_grade_math_proficiency_2021_2022,
  proficiency_level = parse_number(proficiency_level)
)

Your Turn

Solution

# Load Packages -----------------------------------------------------------

library(tidyverse)
library(fs)
library(readxl)
library(janitor)

# Create Directories ------------------------------------------------------

# Folder for the raw Excel files; the import step below reads from "data-raw/"
dir_create("data-raw")

# Download Data -----------------------------------------------------------

# https://www.oregon.gov/ode/reports-and-data/students/Pages/Student-Enrollment-Reports.aspx

# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20222023.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20222023.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20212022.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20212022.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20202021.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20202021.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20192020.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20192020.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20182019.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20182019.xlsx")

# Import Data -------------------------------------------------------------

# Read the "School 2022-23" sheet of the fall membership report and
# standardise its column names with janitor::clean_names()
enrollment_2022_2023 <-
  "data-raw/fallmembershipreport_20222023.xlsx" |>
  read_excel(sheet = "School 2022-23") |>
  clean_names()

# Tidy and Clean Data -----------------------------------------------------

# Build the long-format enrollment table: the two institution ids plus one
# row per race/ethnicity count column.
#
# The race_ethnicity values are intentionally left as the raw "x2022_23_*"
# column names. Every pipeline that follows starts from this object and applies
# str_remove() and a recoding step itself, so the values must not be cleaned
# here already. (Previously they were recoded in this assignment, which left
# the later steps with nothing to match.)
enrollment_by_race_ethnicity_2022_2023 <-
  enrollment_2022_2023 |>
  select(district_institution_id, school_institution_id,
         x2022_23_american_indian_alaska_native:x2022_23_multi_racial) |>
  # Drop the percentage columns so that only student counts are pivoted
  select(-contains("percent")) |>
  pivot_longer(cols = -c(district_institution_id, school_institution_id),
               names_to = "race_ethnicity",
               values_to = "number_of_students")

# Strip the "x2022_23_" year prefix from the race_ethnicity values
mutate(
  enrollment_by_race_ethnicity_2022_2023,
  race_ethnicity = str_remove(race_ethnicity, pattern = "x2022_23_")
)

# Strip the year prefix, then translate each value to a readable label with
# recode() (pairs are old_value = "new label"). Both steps live in one
# mutate(); its expressions run in order, so recode() sees the stripped values.
enrollment_by_race_ethnicity_2022_2023 |>
  mutate(
    race_ethnicity = str_remove(race_ethnicity, pattern = "x2022_23_"),
    race_ethnicity = recode(
      race_ethnicity,
      american_indian_alaska_native = "American Indian Alaska Native",
      asian = "Asian",
      black_african_american = "Black/African American",
      hispanic_latino = "Hispanic/Latino",
      multiracial = "Multi-Racial",
      native_hawaiian_pacific_islander = "Native Hawaiian Pacific Islander",
      white = "White",
      multi_racial = "Multiracial"
    )
  )

# Strip the year prefix, then relabel one value at a time with if_else():
# each step replaces a single matching value and leaves all others untouched.
# Expressions inside one mutate() are evaluated in order, so each step works
# on the result of the step before it.
enrollment_by_race_ethnicity_2022_2023 |>
  mutate(
    race_ethnicity = str_remove(race_ethnicity, pattern = "x2022_23_"),
    race_ethnicity = if_else(
      race_ethnicity == "american_indian_alaska_native",
      "American Indian Alaska Native",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "asian",
      "Asian",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "black_african_american",
      "Black/African American",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "hispanic_latino",
      "Hispanic/Latino",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "multiracial",
      "Multi-Racial",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "native_hawaiian_pacific_islander",
      "Native Hawaiian Pacific Islander",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "white",
      "White",
      race_ethnicity
    ),
    race_ethnicity = if_else(
      race_ethnicity == "multi_racial",
      "Multiracial",
      race_ethnicity
    )
  )

# Strip the year prefix, then relabel every race/ethnicity value in a single
# case_when(). Each line reads: condition ~ replacement label.
enrollment_by_race_ethnicity_2022_2023 |> 
  mutate(race_ethnicity = str_remove(race_ethnicity, pattern = "x2022_23_")) |> 
  mutate(race_ethnicity = case_when(
    race_ethnicity == "american_indian_alaska_native" ~ "American Indian Alaska Native",
    race_ethnicity == "asian" ~ "Asian",
    race_ethnicity == "black_african_american" ~ "Black/African American",
    race_ethnicity == "hispanic_latino" ~ "Hispanic/Latino",
    race_ethnicity == "multiracial" ~ "Multi-Racial",
    race_ethnicity == "native_hawaiian_pacific_islander" ~ "Native Hawaiian Pacific Islander",
    race_ethnicity == "white" ~ "White",
    race_ethnicity == "multi_racial" ~ "Multiracial",
    # Fallback: keep any value not listed above as-is. Without this branch
    # case_when() silently turns every unmatched value into NA.
    TRUE ~ race_ethnicity
  ))

Remove the “x2022_23_” portion of the race_ethnicity variable using str_remove().
Convert all values of the race_ethnicity variable to more meaningful labels (e.g. turn “american_indian_alaska_native” into “American Indian/Alaska Native”) using recode(), if_else(), or case_when().

Use the following starter code to help you if necessary.

# Load Packages -----------------------------------------------------------

library(tidyverse)
library(fs)
library(readxl)
library(janitor)

# Create Directories ------------------------------------------------------

# Folder for the raw Excel files; the import step below reads from "data-raw/"
dir_create("data-raw")

# Download Data -----------------------------------------------------------

# https://www.oregon.gov/ode/reports-and-data/students/Pages/Student-Enrollment-Reports.aspx

# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20222023.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20222023.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20212022.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20212022.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20202021.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20202021.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20192020.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20192020.xlsx")
# 
# download.file("https://www.oregon.gov/ode/reports-and-data/students/Documents/fallmembershipreport_20182019.xlsx",
#               mode = "wb",
#               destfile = "data-raw/fallmembershipreport_20182019.xlsx")

# Import Data -------------------------------------------------------------

# Import the "School 2022-23" sheet of the fall membership report, then tidy
# up its column names with janitor::clean_names()
enrollment_2022_2023 <- clean_names(
  read_excel(
    path = "data-raw/fallmembershipreport_20222023.xlsx",
    sheet = "School 2022-23"
  )
)

# Tidy and Clean Data -----------------------------------------------------

# Starting point for the exercise: a long-format table with the two
# institution ids plus one row per race/ethnicity count column. The
# race_ethnicity values still carry the raw "x2022_23_*" column names;
# cleaning them is the task.
#
# school_institution_id is kept alongside district_institution_id because the
# sheet is school-level: with the district id alone, rows belonging to
# different schools in the same district could not be told apart.
enrollment_by_race_ethnicity_2022_2023 <-
  enrollment_2022_2023 |> 
  select(district_institution_id, school_institution_id,
         x2022_23_american_indian_alaska_native:x2022_23_percent_multi_racial) |> 
  # Drop the percentage columns so that only student counts are pivoted
  select(-contains("percent")) |> 
  pivot_longer(cols = -c(district_institution_id, school_institution_id),
               names_to = "race_ethnicity",
               values_to = "number_of_students")

Learn More

Bob Rudis has a semi-complex article on how he uses case_when() to work with data related to his work on internet security.

You might also find this video by Sharon Machlis on case_when() helpful.

And, in case you think I’m the only one who loves case_when(), check out this love letter to the function by Matt Kerlogue.

Have any questions? Put them below and we will help you out!

You need to be signed-in to comment on this post. Login.

Joseph Lehman

April 23, 2024

We may get to this in the future or already covered it, but is there a way to drop characters either at the beginning or end of a cell? I am thinking something like the "RIGHT" and "LEFT" functions in Excel.

David Keyes Founder

April 24, 2024

Yup, check out this quick video! Code I used in it is below:

library(tidyverse)
library(palmerpenguins)

# Keep only the sex column, then add sex_clipped holding its first character
# (str_sub() with start = 1 and end = 1, similar to LEFT(x, 1) in Excel)
mutate(
  select(penguins, sex),
  sex_clipped = str_sub(sex, start = 1, end = 1)
)

Hope this helps!

Going Deeper with R

Advanced Data Wrangling

Advanced Data Visualization

Quarto