## 提升R代码运算效率的11个实用方法

| |
[ 所属分类 商业智能 | 发布者 店小二04 | 时间 | 作者 红领巾 ] 0人收藏点击收藏

# Build the benchmark data frame: four numeric columns with 12^5 random
# draws each (uniform, normal, Poisson and chi-squared, in that order).
col1 <- runif(n = 12^5, min = 0, max = 2)
col2 <- rnorm(n = 12^5, mean = 0, sd = 2)
col3 <- rpois(n = 12^5, lambda = 3)
col4 <- rchisq(n = 12^5, df = 2)

# Column names are spelled out; they match the vector names used above.
df <- data.frame(col1 = col1, col2 = col2, col3 = col3, col4 = col4)

# Original R code: before vectorization and pre-allocation.
# Deliberately slow baseline: every iteration indexes the data frame cell by
# cell and writes the label straight into a fifth column of `df`.
system.time({
  for (i in seq_len(nrow(df))) { # for every row
    row_total <- df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]
    if (row_total > 4) { # check if > 4
      df[i, 5] <- "greater_than_4" # assign 5th column
    } else {
      df[i, 5] <- "lesser_than_4" # assign 5th column
    }
  }
})

1.向量化处理和预分配数据结构

# After vectorization and pre-allocation.
# The result vector is created at its final length before the loop, so each
# iteration only fills one slot of `output`; the data frame gets the new
# column in a single assignment at the end.
output <- character(nrow(df)) # pre-allocate the output vector
system.time({
  for (i in seq_len(nrow(df))) {
    row_total <- df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]
    if (row_total > 4) {
      output[i] <- "greater_than_4"
    } else {
      output[i] <- "lesser_than_4"
    }
  }
  df$output <- output
})

2.将条件语句判断条件移至循环外

# After vectorization and pre-allocation, taking the condition checking
# outside the loop: the row sums and the comparison are computed once for all
# rows, and the loop only reads the resulting logical vector.
output <- character(nrow(df))
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4 # checked once
system.time({
  for (i in seq_len(nrow(df))) {
    if (condition[i]) {
      output[i] <- "greater_than_4"
    } else {
      output[i] <- "lesser_than_4"
    }
  }
  df$output <- output
})

3.只在条件语句为真时执行循环过程

# Run the loop only where the condition is TRUE.
# `output` starts out filled with the "lesser" label, so only the rows whose
# sum exceeds 4 need to be visited and overwritten.
output <- rep("lesser_than_4", nrow(df))
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4
system.time({
  for (i in which(condition)) { # run loop only for true conditions
    output[i] <- "greater_than_4"
  }
  df$output <- output
})

4.尽可能地使用 ifelse() 语句

# Fully vectorized version: ifelse() picks the label for every row at once
# from the logical vector produced by the comparison.
system.time({
  output <- ifelse(
    (df$col1 + df$col2 + df$col3 + df$col4) > 4,
    "greater_than_4",
    "lesser_than_4"
  )
  df$output <- output
})

5.使用 which() 语句

# Thanks to Gabe Becker
# Pre-fill every row with the "less" label, then overwrite only the positions
# returned by which(). rowSums() is restricted to the four numeric columns
# because it fails on a data frame that also holds the character `output`
# column added by the earlier steps.
system.time({
  want <- which(rowSums(df[, c("col1", "col2", "col3", "col4")]) > 4)
  output <- rep("less than 4", times = nrow(df))
  output[want] <- "greater than 4"
})

# nrow = 3 Million rows (approx)

user system elapsed

0.396 0.074 0.481

6.用 apply 族函数替代 for 循环语句

# apply family
# myfunc() labels a single row: `x` is the named numeric vector that apply()
# passes for each row of the four numeric columns.
system.time({
  myfunc <- function(x) {
    if ((x["col1"] + x["col2"] + x["col3"] + x["col4"]) > 4) {
      "greater_than_4"
    } else {
      "lesser_than_4"
    }
  }
  output <- apply(df[, 1:4], 1, FUN = myfunc) # apply 'myfunc' on every row
  df$output <- output
})

7.利用compiler包编译函数cmpfun()

# Byte-code compilation: run the same row-wise apply() as before, but with a
# version of myfunc() passed through compiler::cmpfun() first.
library(compiler)
myFuncCmp <- cmpfun(myfunc)
system.time({
  output <- apply(X = df[, 1:4], MARGIN = 1, FUN = myFuncCmp)
})

8.利用Rcpp

# Compile MyFunc.cpp with Rcpp and time the exported myFunc() on the whole
# data frame (the C++ source is listed below).
library(Rcpp)
sourceCpp("MyFunc.cpp")
system.time({
  output <- myFunc(df)
})

// Source for MyFunc.cpp

#include

using namespace Rcpp;

// [[Rcpp::export]]

CharacterVector myFunc(DataFrame x) {

NumericVector col1 = as(x["col1"]); NumericVector col2 = as(x["col2"]); NumericVector col3 = as(x["col3"]); NumericVector col4 = as(x["col4"]);

int n = col1.size();

CharacterVector out(n);

for (int i=0; i 4){

out[i] = "greater_than_4";

} else {

out[i] = "lesser_than_4";

}

}

return out;

}

9.利用并行运算

# parallel processing
library(foreach)
library(doSNOW)

cl <- makeCluster(4, type = "SOCK") # for 4 cores machine
registerDoSNOW(cl)

condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4

# parallelization with vectorization
# Each iteration yields one label; .combine = c concatenates them into a
# character vector. The value of the braced expression is what foreach
# collects, so no return() call is needed.
system.time({
  output <- foreach(i = seq_len(nrow(df)), .combine = c) %dopar% {
    if (condition[i]) {
      "greater_than_4"
    } else {
      "lesser_than_4"
    }
  }
})

# Shut the worker processes down once the parallel section is finished.
stopCluster(cl)

df$output <- output

10.尽早移除变量并释放内存

用 rm() 尽早删除不再需要的对象（尤其是在进入耗时较长的循环之前），必要时调用 gc() 回收内存。

11.利用内存较小的数据结构

data.table() 是一个很好的例子：它可以减少数据占用的内存，从而有助于加快运算速度。

# Same row-by-row loop as the baseline, but on a data.table: values are read
# with dt[i, col] and the label is written by reference with `:=`.
library(data.table)

dt <- data.table(df) # create the data.table
system.time({
  for (i in seq_len(nrow(dt))) {
    row_total <- dt[i, col1] + dt[i, col2] + dt[i, col3] + dt[i, col4]
    if (row_total > 4) {
      dt[i, col5 := "greater_than_4"] # assign the output as 5th column
    } else {
      dt[i, col5 := "lesser_than_4"] # assign the output as 5th column
    }
  }
})

ifelse：1752X，1500000行每秒

which：8806X，7540364行每秒

Rcpp：13476X，11538462行每秒

QQ群：418451831

tags: #160,df,lt,output,than,运算,col3,col1,col2,col4,time,nrow,gt

1.凡CodeSecTeam转载的文章,均出自其它媒体或其他官网介绍,目的在于传递更多的信息,并不代表本站赞同其观点或对其真实性负责；
2.转载的文章仅代表原创作者观点,与本站无关。其原创性以及文中陈述文字和内容未经本站证实,本站对该文以及其中全部或者部分内容、文字的真实性、完整性、及时性，不作出任何保证或承诺；
3.如本站转载稿涉及版权等问题,请作者及时联系本站,我们会及时处理。