'Split string column to create new binary columns
My data has one column and I am trying to create additional columns with what’s after each “/” in the rows. Here are the first few rows of the data:
> dput(mydata)
structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c("/
ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.g
r/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,
-5L))
The result should look like this (data frame) with a “1” in the new column if the variable appears in the row and “0” if not:
> dput(Result)
structure(list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca
/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/
eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1fe.
gr/eq2_on/eq2_off/hi.on/hi.ov"
), class = "factor"), ca = c(1L, 1L, 1L, 1L, 1L), put = c(1L,
1L, 1L, 1L, 1L), sent_1 = c(1L, 1L, 1L, 1L, 1L), fe.gr = c(1L,
1L, 1L, 1L, 1L), eq2_on = c(1L, 1L, 1L, 1L, 1L), eq2_off = c(1L,
1L, 1L, 1L, 1L), hi.on = c(0L, 1L, 1L, 0L, 0L), hi.ov = c(0L,
1L, 1L, 0L, 0L), cni_at.p3x.4 = c(0L, 0L, 0L, 1L, 0L), cbr_LBL = c(0L
,
0L, 0L, 0L, 1L)), .Names = c("ALL", "ca", "put", "sent_1", "fe.gr",
"eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"
), class = "data.frame", row.names = c(NA, -5L))
I have tried many functions including strsplit and sapply:
sapply(strsplit(as.character(mydata$ALL), “\\/”), “[[“, 2) #returns "ca"s only
sapply(strsplit(as.character(mydata$ALL), "\\/"), "[[", 3) #returns "put"s only
There are millions of rows and I’d greatly appreciate anything that is quick and efficient.
Solution 1:[1]
You can use cSplit_e from my "splitstackshape" package:
library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
# ALL ALL_ca ALL_cbr_LBL
# 1 /ca/put/sent_1/fe.gr/eq2_on/eq2_off 1 0
# 2 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 3 /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4 1 0
# 5 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL 1 1
# ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1 0 1 1 1 0 0 1
# 2 0 1 1 1 1 1 1
# 3 0 1 1 0 1 1 1
# 4 1 1 1 1 0 0 1
# 5 0 1 1 1 0 0 1
# ALL_sent_1 ALL_sent_1fe.gr
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 1 0
(Note: I think there's a problem in row 3 of your dput which is why it doesn't match your desired output. Notice that the third item in row 3 is "sent_1fe.gr" with no "/" between them.)
Solution 2:[2]
How about something like this
spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))
which returns
ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1 1 1 1 1 1 1 0 0 0 0 0
[2,] 1 1 1 1 1 1 1 1 1 0 0 0
[3,] 1 1 1 1 0 1 1 1 1 1 0 0
[4,] 1 1 1 1 1 1 1 0 0 0 1 0
[5,] 1 1 1 1 1 1 1 0 0 0 0 1
Solution 3:[3]
Other option is to melt the split string in list to long form and then use table
library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
mydata[[1]]), "/"))[2:1]))[,-1]
# ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1 1 1 1 1 1 1 0 0 0 0
#2 1 1 1 1 1 1 1 1 0 0
#3 1 1 1 0 1 0 1 1 1 0
#4 1 1 1 1 1 1 0 0 0 1
#5 1 1 1 1 1 1 0 0 0 0
# cbr_LBL
#1 0
#2 0
#3 0
#4 0
#5 1
Solution 4:[4]
a tidyverse solution
library(tidyverse)
mydata %>%
rownames_to_column() %>%
mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
unnest %>%
spread(key,value,0) %>%
select(-rowname)
# ALL ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1 1 1 1 0 0 1 1 1 0 0 1 1
# 2 4 1 1 0 0 1 1 1 1 1 1 1
# 3 4 1 1 0 0 1 1 1 1 1 1 1
# 4 3 1 1 0 1 1 1 1 0 0 1 1
# 5 2 1 1 1 0 1 1 1 0 0 1 1
data
mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))
Solution 5:[5]
Here's a solution that uses dplyr and tidyr (note: I cleaned up what appears to be an missing / in row three of your sample data):
## Input
input <- structure(
list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
.Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
class = "factor")),
.Names = "ALL", class = "data.frame", row.names = c(NA, -5L))
## Solution
require(dplyr)
require(tidyr)
solution <- input %>%
mutate(temp = sub("^/", "", ALL)) %>%
separate(temp,
c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
"hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
"/", extra="merge") %>%
mutate_each(funs(as.numeric(!is.na(.))), -ALL)
Solution 6:[6]
Having myself similar, but more complex, problem I figured the following functional way that allows to cast in 'one hot' way string seperated character values of a column/s individually as well as categorical variables itself:
oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is character vector or numeric vector
if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
for(i in TempDT[, columns, with = F]){
if(class(i) != "character") i <- as.character(i)
uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
rm(TempDT)
break }
for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
}
if(exists("TempDT")) return(TempDT)
}
DF = data.frame(
aColumn=rep(c("f", "b", "c"), 100000),
xColumn=rep(c("N/W", "W", "R"), 100000),
yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
zColumn=rep(20:22, 100000))
str(DF) #factors are present in the data.frame
oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result
# aColumn xColumn yColumn zColumn f b c N W R A B V G
# 1: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 2: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# 3: c R B/G 22 0 0 1 0 0 1 0 1 0 1
# 4: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 5: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# ---
#299996: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#299997: c R B/G 22 0 0 1 0 0 1 0 1 0 1
#299998: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
#299999: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#300000: c R B/G 22 0 0 1 0 0 1 0 1 0 1
Similarly it applies to the OP´s problem:
input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
))
oneHotOnText(input, columns = "ALL", seperator = "/")[]
Solution 7:[7]
here is one more tidyverse solution
library(tidyverse)
nn <- max(str_count(df$ALL, "/"), na.rm = T)+1
df %>% mutate(row_id =row_number()) %>%
select(row_id, everything()) %>%
separate(ALL, into = paste("tag", 1:nn), sep = "/") %>%
pivot_longer(cols = paste("tag", 2:nn), names_to = "name", values_to = "val") %>%
filter(!is.na(val)) %>%
select(-`tag 1`) %>%
mutate(new=1) %>%
#group_by(row_id) %>%
pivot_wider(id_cols = row_id, names_from = val, values_from = new, values_fill =0)
# A tibble: 5 x 11
row_id ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov cni_at.p3x.4
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 1 1 0 0 0
2 2 1 1 1 1 1 1 1 1 0
3 3 1 1 1 1 1 1 1 1 0
4 4 1 1 1 1 1 1 0 0 1
5 5 1 1 1 1 1 1 0 0 0
# ... with 1 more variable: cbr_LBL <dbl>
Solution 8:[8]
For multiple columns with commas separate multi-items hot encoding:
library(qdapTools)
#hot-encoding
mtabulate(apply(dataset[3451:3457], 1, function(x) strsplit(x, ",")))
cbind(dataset, mtabulate(strsplit(mydf$Info, ", ")))
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | A5C1D2H2I1M1N2O1R2T1 |
| Solution 2 | MrFlick |
| Solution 3 | akrun |
| Solution 4 | moodymudskipper |
| Solution 5 | Daddy the Runner |
| Solution 6 | Patrik_P |
| Solution 7 | AnilGoyal |
| Solution 8 | benson23 |
