A benchmark (adding sub_a and sub_b creation into Sotos and Heikki answers so everyone start on the same initial vectors a of 800 observations and b of 1000 observations).
Running the benchmark with:
library(dplyr)
library(data.table)
library(microbenchmark)
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
zx8754(a,b), zx8754for(a,b), Frank(a,b),
times = 5, unit = 'relative')
gives:
Unit: relative
expr min lq mean median uq max neval cld
Jaap1(a, b) 34.703161 34.555558 32.159832 33.589581 33.379529 26.404050 5 d
Jaap2(a, b) 4.632965 4.757991 4.526898 4.703065 4.855419 3.879726 5 a
Tensi(a, b) 280.223144 282.671462 262.658752 273.782450 273.316759 217.585132 5 g
Heikki(a, b) 80.749127 80.611073 79.538881 82.291796 85.075395 71.342298 5 f
Sotos(a, b) 26.127738 26.050240 24.299225 25.205191 25.262141 20.166425 5 c
Matt_base(a, b) 33.349114 33.438351 31.153113 32.459036 31.932295 26.181568 5 d
Matt_dplyr(a, b) 1.546961 1.540109 1.481780 1.527166 1.575283 1.280775 5 a
Docendo(a, b) 1.366530 1.399819 1.328550 1.363766 1.348267 1.204104 5 a
zx8754(a, b) 15.751249 16.257225 14.970995 15.755592 15.601824 12.322439 5 b
zx8754for(a, b) 46.335615 48.606064 45.353230 48.128807 47.942506 37.997405 5 e
Frank(a, b) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5 a
The used functions:
Jaap1 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x,1,1), df$values.y)
}
Jaap2 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}
Tensi <- function(a,b) {
unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}
Heikki <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
result <- c()
for (ai in a) {
sub_ai <- substr(ai,2,3)
if (sub_ai %in% sub_a) {
b_match <- (sub_b == sub_ai)
result <- c(result,paste0(ai,substr(b[b_match],3,4)))
}
}
result
}
Sotos <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,]
d1$Var1 <- substr(d1$Var1, 1, 1)
do.call(paste0, d1)
}
Matt_base <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1$first_a = substr(a1$a, 1, 1)
a1$last_a = substr(a1$a, 2, 3)
b1$first_b = substr(b1$b, 1, 2)
b1$last_b = substr(b1$b, 3, 3)
c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")
results <- paste0(c1$a, c1$last_b)
}
Matt_dplyr <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
results <- paste0(c1$a, c1$last_b)
}
Docendo <- function(a, b) {
split_a <- split(a, substr(a, 2, 3))
split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
idx <- intersect(names(split_a), names(split_b))
stopifnot(length(idx) > 0)
unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]),
use.names = FALSE)
}
zx8754 <- function(a, b) {
unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754for <- function(a, b) {
res <- integer()
for(i in a) res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
res
}
Frank <- function(a, b) {
aDT <- as.data.table(tstrsplit(a, ""))
bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
d1 <- expand.grid(a, b); d2 <- expand.grid(sub_a, sub_b); i1 <- d2$Var1 == d2$Var2; do.call(paste0, d1[i1,])and then remove duplicates from each string – Sotos 6 hours agomatchwill suit you there as it stops after first match. Probably more likesub_a %in% sub– Cath 6 hours agocommonpart is repeated twice in the final output. Can we make it more efficient? I am already using so many extra variables (sub_a,sub_b,common), adding two more would be expensive? – Ronak Shah 6 hours ago