dataframe reshape

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE)

library(tidyverse)
library(knitr)

data reshaping

https://tidyr.tidyverse.org/
tidying data
make data “one case per row”, “one feature per column”, “one value per cell”

data reshaping terminology
tidyr	gather	spread
reshape(2)	melt	cast
spreadsheets	unpivot	pivot
databases	fold	unfold

- simple transpose doesn't work - some contents are correct, some are not
- `gather()` - moves columns into rows - `spread()` - moves rows into columns

untidy data

seps <- read_csv("http://www.mm-c.me/mdsi/hospitals93to98.csv")
seps %>% dim()
seps %>% head()

problems: - PatientDays and Separations are together - years are colnames

step1 - gather()

push data in columns to rows
gather(data, key, value, key_range, na.rm, convert)
key = name of new “naming” variable (year)
value = name of new “result” variable (value)
convert = if TRUE, converts what looks like numeric to numeric, char to char
gather() moves colnames into a “key” column, gathers col values into a single “values” column

inprog <- seps %>% gather(., key=year, value=value, FY1993:FY1998); inprog %>% head()

step2 - spread()

spread value of a value column (value) across new columns
long to wide
spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)
key = column you want to split apart (Field)
value = column you want to use to populate the new columns (value)
fill = if combinations doesn’t exist

rearrag <- spread(inprog, key = Field, value = value); rearrag %>% head()
# every row is a unique combination of col values

separate()

separate(data, col, into, sep)

rearrag %>% separate(., col = year, into = c("leader", "year"), sep = 2) %>% head()
rearrag %>% separate(., col = year, into = c("leader", "year"), sep = "19") %>% head()

whatever is in sep will be gone, so need preceded-by and followed-by stringR syntax

# https://stackoverflow.com/questions/9756360/split-character-data-into-numbers-and-letters
# (?<=[A-Za-z]) means preceded by a letter
# (?=[0-9]) means followed by a number
# the middle is where you want to separate
rearrag %>% separate(., col = year, into = c("leader", "year"), sep = "(?<=[A-Za-z])(?=[0-9])") %>% head()

unite()

unite(data, cols, col, sep)

sep_d = rearrag %>% separate(., col = year, into = c("leader", "year"), sep = "(?<=[A-Za-z])(?=[0-9])")
sep_d %>% unite(., leader, year, col = year, sep = "") %>% head()

# unite 3 columns
sep_d2 = sep_d %>% separate(., col = year, into = c("century", "year"), sep = "(?<=19)(?=9)"); sep_d2 %>% head()
sep_d2 %>% unite(., leader, century, year, col = year, sep = "_") %>% head()

EOF

Last updated on May 23, 2020

Edit this page