The purpose of this script is to add Stata variable and value labels
to IPEDS data in R and save the result. One could also use Stata to
(1) read in the data, (2) assign the labels, (3) save the result as a
*.dta
file, and then (4) use the
haven
package to read
the labelled data into R. This script represents a pure R
solution. You will need the
tidyverse
and
labelled
packages.
To use, download the Stata version of each required data file from
IPEDS
(*_Data_Stata.zip
, which is really just a CSV file) along with its
accompanying labelling file, *_Stata.zip
). Do not unzip the
files. By default, the script assumes these two sets of files and all
output will be in the same directory. If you want to use separate
folders for the Stata data, Stata label, and output R data files, you
can set the paths in the script.
(You can batch download the IPEDS files you need using
downloadipeds.r
. Simply
comment out or erase the files you don’t want in the accompanying
ipeds_file_list.txt
and make sure you set the option stata_data ==
TRUE
in the main script.)
Due to changes across IPEDS data files over the years, it’s probable
that some files will not be labelled correctly. That said the script does
run (that is, assign labels) for all IPEDS zip files available as of
mid-June 2018.
################################################################################
##
## <PROJ> Add variable / value labels to IPEDS data in R
## <FILE> label_ipeds.r
## <AUTH> Benjamin Skinner @btskinner
## <INIT> 12 July 2018
##
################################################################################
## USAGE -----------------------------------------------------------------------
##
## (1) download relevant Stata data and label files from IPEDS (leave zipped)
##
## - Stata data: *_Data_Stata.zip
## - Stata labels: *_Stata.zip
##
## (2) change input/output directories below if desired
##
## (3) run
##
## NB: You can download zipped IPEDS files using < downloadipeds.r > script @
## https://github.com/btskinner/downloadipeds
## -----------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## SET I/O DIRECTORIES (DEFAULT = everything in the current directory)
## -----------------------------------------------------------------------------
## If directory structure like this EXAMPLE:
##
## ./
## |__/r_data
## |
## |__/stata_data
## | |-- ADM2014_Data_Stata.zip
## | |-- ADM2015_Data_Stata.zip
## |
## |__/stata_labels
## | |-- ADM2014_Stata.zip
## | |-- ADM2015_Stata.zip
## |
## |-- label_ipeds.r
##
## Then:
##
## labs_ddir <- file.path('.', 'stata_labels')
## stata_ddir <- file.path('.', 'stata_data')
## r_ddir <- file.path('.', 'r_data')
labs_ddir <- file.path('.') # path to folder w/ zipped label files
stata_ddir <- file.path('.') # path to folder w/ zipped Stata data
r_ddir <- file.path('.') # path to output folder for Rdata files
## -----------------------------------------------------------------------------
## WANT NOISIER OUTPUT? (DEFAULT = FALSE)
## -----------------------------------------------------------------------------
## allow readr::read_csv() messages?
noisy <- FALSE
## -----------------------------------------------------------------------------
## LIBRARIES & FUNCTIONS
## -----------------------------------------------------------------------------
## libraries
libs <- c('tidyverse','labelled')
lapply(libs, require, character.only = TRUE)
read_zip <- function(zipfile, type, noisy) {
## create a name for the dir where we'll unzip
zipdir <- tempfile()
## create the dir using that name
dir.create(zipdir)
## unzip the file into the dir
unzip(zipfile, exdir = zipdir)
## get the files into the dir
files <- list.files(zipdir, recursive = TRUE)
## chose rv file if more than two b/c IPEDS likes revisions
if (length(files) > 1) {
file <- grep('*_rv_*', tolower(files), value = TRUE)
if (length(file) == 0) {
file <- grep('*\\.csv', files, value = TRUE)
}
} else {
file <- files[1]
}
## get the full name of the file
file <- file.path(zipdir, file)
## read the file
if (type == 'csv') {
if (noisy) {
out <- read_csv(file)
} else {
out <- suppressMessages(suppressWarnings(read_csv(file,
progress = FALSE)))
}
} else {
out <- readLines(file, encoding = 'latin1')
}
## remove tmp
unlink(zipdir, recursive = TRUE)
## return
return(out)
}
read_labels <- function(zipfile) {
## read in label file
labs <- read_zip(zipfile, 'do')
## get insheet line and add one to get next line
line_no <- grep('insheet', labs) + 1
## drop header
labs <- labs[line_no:length(labs)]
## drop first asterisk
labs <- gsub('^\\*(.+)$', '\\1', labs)
## return
return(labs)
}
assign_var_labels <- function(df, label_vec) {
## get variable label lines
varlabs <- grep('^label variable', label_vec, value = TRUE)
## if no labels, exit
if (length(varlabs) == 0) { return(df) }
## get variables that have labels
vars <- unlist(lapply(varlabs, function(x) { strsplit(x, ' ')[[1]][[3]] }))
## get the labels belonging to those variables
labs <- gsub('label variable .+"(.+)"', '\\1', varlabs)
## create list
varlabs <- setNames(as.list(labs), vars)
## assign to variables
var_label(df) <- varlabs
## return new data frame
return(df)
}
assign_val_labels <- function(df, label_vec) {
## get value label lines
vallabs <- grep('^label define', label_vec, value = TRUE)
## if no labels, exit
if (length(vallabs) == 0) { return(df) }
## get unique defined labels
labdefs <- unique(gsub('^label define (\\w+).+', '\\1', vallabs))
## get label value lines
vars <- grep('^label values', label_vec, value = TRUE)
## make list of variable plus its value definition
vardef <- setNames(as.list(gsub('^label values (\\w+).+', '\\1', vars)),
gsub('^label values \\w+ (\\w+)\\*?.*', '\\1', vars))
## make unique b/c of some double labels
vardef <- vardef[!duplicated(vardef)]
## loop through each variable
for (i in 1:length(labdefs)) {
## get label
labdef <- labdefs[i]
## skip if missing
if (!is.null(vardef[[labdef]])) {
## subset lines with this definition
pattern <- paste0('\\b', labdef, '\\b')
vallab <- grep(pattern, vallabs, value = TRUE)
## get values
pattern <- paste0('label define ', labdef, ' +(-?\\w+).+')
values <- gsub(pattern, '\\1', vallab)
## convert values to class of variable...hacky fix here
suppressWarnings(class(values) <- class(df[[vardef[[labdef]]]]))
## get labels
pattern <- paste0('label define ', labdef, ' .+"(.+)" ?(, ?add ?)?')
labels <- gsub(pattern, '\\1', vallab)
## make list
labels <- setNames(values, labels)
## label values
df[[vardef[[labdef]]]] <- labelled(df[[vardef[[labdef]]]], labels)
}
}
## return dataframe
return(df)
}
assign_imp_labels <- function(df, label_vec) {
## find line numbers surrounding imputation values
line_no_start <- grep('imputation.*variable(s)?', label_vec) + 1
## if no imputation labels, exit
if (length(line_no_start) == 0) { return(df) }
line_no_stop <- grep('^tab\\b', label_vec)[[1]] - 1
labs <- label_vec[line_no_start:line_no_stop]
## get variables starting with 'x'
vars <- df %>% select(starts_with('x')) %>% names(.)
## make list of each impute value and label
values <- gsub('(\\w\\b).+', '\\1', labs)
labels <- gsub('\\w\\b (.+)', '\\1', labs)
labels <- setNames(values, labels)
## loop through each imputed variable
for (v in vars) {
if (class(df[[v]]) == class(values)) {
df[[v]] <- labelled(df[[v]], labels)
}
}
## return dataframe
return(df)
}
## -----------------------------------------------------------------------------
## RUN BY LOOPING THROUGH FILES
## -----------------------------------------------------------------------------
## get list of zip files
stata_zip <- grep('*_Data_Stata\\.zip', list.files(stata_ddir), value = TRUE)
stata_lab <- grep('_Stata\\.zip', list.files(labs_ddir), value = TRUE)
## if stata_ddir and labs_ddir are the same, subset
if (identical(stata_ddir, labs_ddir)) {
stata_lab <- stata_lab[!(stata_lab %in% stata_zip)]
}
## loop
for (i in 1:length(stata_zip)) {
f <- stata_zip[i]
## message
message(paste0('Working with: ', f))
## get basename
fname <- gsub('(^.+)_Data_Stata.zip', '\\1', f)
## get label file
lab_file <- grep(paste0('^', fname, '_Stata'), stata_lab, value = TRUE)
## skip if missing label file
if (length(lab_file) == 0) {
message(paste0(' NO LABEL FILE FOR: ', fname, ', skipping'))
next
}
## read in data
df <- read_zip(file.path(stata_ddir, f), 'csv', noisy) %>%
rename_all(tolower)
## get labels
labs <- read_labels(file.path(labs_ddir, lab_file))
## assign variable labels
df <- assign_var_labels(df, labs)
## assign value labels
df <- assign_val_labels(df, labs)
## assign imputation labels
df <- assign_imp_labels(df, labs)
## rename data frame to match file name
assign(tolower(fname), df)
## save
save(list = tolower(fname),
file = file.path(r_ddir, paste0(fname, '.Rdata')))
## garbage collect every 10 loops...may help...idk
if (i %% 10 == 0) { gc() }
}
## =============================================================================
## END SCRIPT
################################################################################