I'm having trouble with how long this code takes to run:
```r
library(dplyr)

k <- 1
database_sob <- data.frame()

for (i in unique(database$treeids)) {
  aux <- database[database$treeids == i, ]
  aux <- aux %>% arrange(survey)

  if (!any(is.na(aux$cipo))) {
    if (sum(aux$cipo) > 0) {
      # skip trees whose only cipo observation is the final survey
      if (aux$cipo[length(aux$fazenda_seetree)] != 1 | sum(aux$cipo) > 1) {
        # drop the rows before the first cipo occurrence
        aux_2 <- cumsum(aux$cipo)
        aux <- aux[(length(aux_2) - length(aux_2[aux_2 > 0]) + 1):length(aux_2), ]
        aux_2 <- cumsum(aux$cipo)

        # walk forward while the cumulative cipo count stays flat
        # (note: j is never used after this while loop)
        j <- 1
        while (!is.na(aux_2[j] != aux_2[j + 1]) & (aux_2[j] != aux_2[j + 1])) {
          j <- j + 1
        }

        database_sob[k, 1] <- aux$treeids[1]
        database_sob[k, 2] <- aux$survey[1]

        # scan the remaining surveys until an "event" row is found
        for (l in 2:length(aux$fazenda_seetree)) {
          print(i)  # debug print; printing inside this hot loop is itself slow
          if (aux$cipo[l] == 1 | aux$score[l] %in% c("morta", "falha", "replanta", "saudavel")) {
            database_sob[k, 3] <- aux$survey[l]
            if (aux$cipo[l] == 0 & aux$score[l] %in% c("saudavel")) {
              database_sob[k, 4] <- 0
            } else {
              database_sob[k, 4] <- 1
            }
            break
          } else {
            database_sob[k, 3] <- aux$survey[l]
            database_sob[k, 4] <- 1
          }
        }
        k <- k + 1
      }
    }
  }
}
```
My database has about 6 million rows, and even running it on Databricks I spend a long time waiting for it to complete.
I need some ideas or alternatives to make this run faster. PySpark? Rust? What do you suggest?
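For reference, here is the direction I've been experimenting with: a grouped dplyr sketch that replaces the per-tree `for` loop and the row-by-row growth of `database_sob` (which reallocates the data.frame on every assignment). It reuses my real column names (`treeids`, `survey`, `cipo`, `score`); `scores_end` is just a helper name I made up, and I haven't verified that this reproduces the loop exactly for every edge case (e.g., trees with a single row after trimming):

```r
library(dplyr)

# scores that end the scan in the original loop
scores_end <- c("morta", "falha", "replanta", "saudavel")

database_sob <- database %>%
  arrange(treeids, survey) %>%
  group_by(treeids) %>%
  # same guards as the loop: no NA cipo, at least one cipo observation,
  # and skip trees whose only cipo is the final survey
  filter(!any(is.na(cipo)), sum(cipo) > 0) %>%
  filter(last(cipo) != 1 | sum(cipo) > 1) %>%
  # keep rows from the first cipo onwards (the cumsum() trimming step)
  filter(cumsum(cipo) > 0) %>%
  summarise(
    start_survey = first(survey),
    end_idx = {
      hit <- which(cipo == 1 | score %in% scores_end)
      hit <- hit[hit >= 2]              # the loop starts scanning at row 2
      if (length(hit) > 0) hit[1] else n()
    },
    end_survey = survey[end_idx],
    event = as.integer(!(cipo[end_idx] == 0 & score[end_idx] == "saudavel")),
    .groups = "drop"
  ) %>%
  select(-end_idx)
```

Even if this sketch isn't quite right, is staying in R with grouped/vectorized operations (or data.table) likely to be enough at 6 million rows, or would a PySpark or Rust rewrite be worth the effort?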