dataframe reshape

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE)
library(tidyverse)
library(knitr)

data reshaping

data reshaping terminology
tidyr gather spread
reshape(2) melt cast
spreadsheets unpivot pivot
databases fold unfold

- simple transpose doesn't work - some contents are correct, some are not
- `gather()` - moves columns into rows - `spread()` - moves rows into columns

untidy data

seps <- read_csv("http://www.mm-c.me/mdsi/hospitals93to98.csv")
seps %>% dim()
seps %>% head()

problems: - PatientDays and Separations are together - years are colnames

step1 - gather()

  • push data in columns to rows
  • gather(data, key, value, key_range, na.rm, convert)
    key = name of new “naming” variable (year)
    value = name of new “result” variable (value)
    convert = if TRUE, converts what looks like numeric to numeric, char to char

  • gather() moves colnames into a “key” column, gathers col values into a single “values” column
inprog <- seps %>% gather(., key=year, value=value, FY1993:FY1998); inprog %>% head()

step2 - spread()

  • spread value of a value column (value) across new columns
  • long to wide
  • spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)
    key = column you want to split apart (Field)
    value = column you want to use to populate the new columns (value)
    fill = if combinations doesn’t exist
rearrag <- spread(inprog, key = Field, value = value); rearrag %>% head()
# every row is a unique combination of col values

separate()

  • separate(data, col, into, sep)
rearrag %>% separate(., col = year, into = c("leader", "year"), sep = 2) %>% head()
rearrag %>% separate(., col = year, into = c("leader", "year"), sep = "19") %>% head()
  • whatever is in sep will be gone, so need preceded-by and followed-by stringR syntax
# https://stackoverflow.com/questions/9756360/split-character-data-into-numbers-and-letters
# (?<=[A-Za-z]) means preceded by a letter
# (?=[0-9]) means followed by a number
# the middle is where you want to separate
rearrag %>% separate(., col = year, into = c("leader", "year"), sep = "(?<=[A-Za-z])(?=[0-9])") %>% head()

unite()

  • unite(data, cols, col, sep)
sep_d = rearrag %>% separate(., col = year, into = c("leader", "year"), sep = "(?<=[A-Za-z])(?=[0-9])")
sep_d %>% unite(., leader, year, col = year, sep = "") %>% head()

# unite 3 columns
sep_d2 = sep_d %>% separate(., col = year, into = c("century", "year"), sep = "(?<=19)(?=9)"); sep_d2 %>% head()
sep_d2 %>% unite(., leader, century, year, col = year, sep = "_") %>% head()

EOF

Previous
Next