Skip to content
R for the Rest of Us Logo

Putting Everything Together

This lesson is locked

Get access to all lessons in this course.

If the video is not playing correctly, you can watch it in a new window

Transcript

Click on the transcript to go to that point in the video. Please note that transcripts are auto generated and may contain minor inaccuracies.

A slightly reorganized version of the the code used in this lesson is below.

## %######################################################%##
#                                                          #
####            Putting everything together             ####
####        Chained data cleaning demonstration         ####
#                                                          #
## %######################################################%##


# Load the raw Age of Empires units dataset from csv (aoe_raw.csv)
# Identify and fix common issues that make these data unusable

# load packages -----------------------------------------------------------
library(readr)
library(dplyr)
library(unheadr)
library(stringr)
library(tidyr)

# load data ---------------------------------------------------------------
aoe_raw <- read_csv("data/aoe_raw.csv")

# cleanup -----------------------------------------------------------------
# fix names
aoer <- aoe_raw %>% mash_colnames(2)

# multi line values
aoer <-
  aoer %>%
  unbreak_vals(
    regex = "^\\(", ogcol = Name, newcol = Name_ub,
    sep = " "
  ) %>%
  rename(Name = Name_ub)
# tag subheaders
subhtags <- aoer %>%
  filter(across(Type:`Armor_(Melee/Pierce)`, is.na)) %>%
  select(Name) %>%
  mutate(nametag = paste0("Building-", Name))

# join labels and untangle embedded subheaders
aoerint <- left_join(aoer, subhtags) %>% # gets joined by name
  untangle2("^Building-", nametag, Building) %>%
  select(-nametag) %>%
  mutate(Building = stringr::str_remove(Building, "^Building-")) %>%
  relocate(Building, .after = Name)

# split costs
aoerint <-
  aoerint %>%
  mutate(
    cost_wood = stringr::str_extract(Cost, "(\\d+)(?=W)"),
    cost_food = stringr::str_extract(Cost, "(\\d+)(?=F)"),
    cost_gold = stringr::str_extract(Cost, "(\\d+)(?=G)")
  ) %>%
  relocate(starts_with("cost_"), .after = Cost) %>%
  select(-Cost)


# separate armor types
aoerint <-
  aoerint %>% separate(`Armor_(Melee/Pierce)`, into = c("melee_armor", "pierce_armor"), sep = "\\/")

# name for observational units
aoerint <- aoerint %>% rename(unit = Name)

# clean variable names
aoerintc <- aoerint %>% janitor::clean_names()

# fix accuracy column
aoerintc <- aoerintc %>%
  mutate(accuracy = str_remove(accuracy, "%")) %>%
  readr::type_convert() %>%
  mutate(accuracy = accuracy * .01)

# issue with text as numbers
# 'none' to 0
aoerintc <- aoerintc %>% mutate(min_range = str_replace(min_range, "none", "0"))

# remove whitespace
aoerintc <- aoerintc %>% mutate(across(everything(), str_squish))

# sort out NA vals
aoerintc <-
  aoerintc %>%
  mutate(across(everything(), na_if, "-")) %>%
  mutate(across(everything(), na_if, "Non-trainable")) %>%
  mutate(across(everything(), na_if, ""))

# re-parse and keep only distinct rows
aoerintt <-
  aoerintc %>%
  readr::type_convert() %>%
  distinct()

# informative object name
AOEunits <- aoerintt
AOEunits
summary(AOEunits)

Have any questions? Put them below and we will help you out!

You need to be signed-in to comment on this post. Login.