Nested for loops run too slowly over a list of tibbles in R

Views: 328 Posted: 2022-10-16 Tags: performance r time-series nested-loops tibble

Problem description

I have a list of 106 time series in data_sensor. Each tibble has two columns: the date and the temperature.

On the other hand, I have a list of 106 dates in date_admin, containing the date at which I want each time series (tibble) to end.

The code works, but the nested for loops take far too long, because each tibble has close to 10,000 rows on average.

library(tidyverse)
library(readxl)   # needed for read_xls() / read_xlsx()

# List nesting all the dataframes of all the xls files
files <- dir("C:/User*inals", pattern = "\\.xls$", full.names = TRUE)
data_sensor <- lapply(files, read_xls)

## List nesting all the dataframes of all the xlsx files
filesx <- dir("C:/Us******ls", pattern = "\\.xlsx$", full.names = TRUE)
data_generic <- lapply(filesx, read_xlsx)

idxend = vector()
for (i in seq_along(data_sensor)) {
  for (j in seq_along(data_sensor[[i]][[1]])) {
    if (as.Date(data_sensor[[i]][[1]][[j]]) < as.Date(date_admin[i])) {
      data_sensor[[i]][[1]][[j]] = data_sensor[[i]][[1]][[j]]
    } else { # Convert all the elements after the condition to NA's
      data_sensor[[i]][[1]][[j]] = NA
      data_sensor[[i]][[2]][[j]] = NA
    }
  }
}

# Drop all NA's
for (i in seq_along(data_sensor)) {
  data_sensor[[i]] = drop_na(data_sensor[[i]])
}
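
For reference, the truncation this loop performs can be written without element-by-element indexing; below is a minimal sketch using Map() and dplyr::filter() (assuming, as described above, that date_admin is a date vector parallel to data_sensor):

# Sketch only: keep the rows strictly before each admin date, which replaces
# both the element-wise NA assignment and the later drop_na() pass.
data_sensor <- Map(
  function(tbl, cutoff) dplyr::filter(tbl, as.Date(date) < as.Date(cutoff)),
  data_sensor,
  date_admin
)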

To clarify, here are the tibbles in my list and the date_admin vector:

> data_sensor[[1]][[1]][[1]]
[1] "2018-08-07 11:00:31 UTC"
> data_sensor[[1]][[2]][[1]]
[1] 6.3
> data_sensor[[2]][[1]][[1]]
[1] "2018-08-08 11:56:05 UTC" 
#data_sensor[[index of list]][[column of tibble(date,Temperature)]][[row of tibble]]
> date_admin
  [1] "2018-10-07 UTC" "2018-12-29 UTC" "2018-12-13 UTC" "2019-08-09 UTC" "2019-10-10 UTC"
  [6] "2019-04-26 UTC" "2018-11-21 UTC" "2018-08-23 UTC" "2019-07-08 UTC" "2019-11-19 UTC"
 [11] "2019-11-07 UTC" "2018-09-05 UTC" "2018-09-03 UTC" "2018-09-24 UTC" "2018-10-11 UTC"
 [16] "2018-09-25 UTC" "2019-03-29 UTC" "2018-08-20 UTC" "2018-09-17 UTC" "2019-03-30 UTC"
 [21] "2018-11-07 UTC" "2019-01-01 UTC" "2018-08-31 UTC" "2019-03-27 UTC" "2019-11-10 UTC"
 [26] "2019-04-04 UTC" "2019-10-18 UTC" "2018-09-06 UTC" "2018-09-23 UTC" "2018-09-22 UTC"
 [31] "2019-07-22 UTC" "2018-09-04 UTC" "2019-05-17 UTC" "2018-11-05 UTC" "2018-12-09 UTC"
 [36] "2018-09-03 UTC" "2019-05-21 UTC" "2019-02-22 UTC" "2018-08-30 UTC" "2019-06-04 UTC"
 [41] "2018-09-13 UTC" "2018-10-14 UTC" "2019-11-08 UTC" "2018-08-30 UTC" "2019-04-12 UTC"
 [46] "2018-09-24 UTC" "2018-08-22 UTC" "2018-08-30 UTC" "2018-09-07 UTC" "2018-11-11 UTC"
 [51] "2018-11-01 UTC" "2018-10-01 UTC" "2018-10-22 UTC" "2018-12-03 UTC" "2019-06-06 UTC"
 [56] "2018-09-09 UTC" "2018-09-10 UTC" "2018-09-24 UTC" "2018-10-11 UTC" "2018-11-30 UTC"
 [61] "2018-09-20 UTC" "2019-11-20 UTC" "2018-10-11 UTC" "2018-10-09 UTC" "2018-09-27 UTC"
 [66] "2019-11-11 UTC" "2018-10-04 UTC" "2018-09-14 UTC" "2019-04-27 UTC" "2018-09-04 UTC"
 [71] "2018-09-11 UTC" "2018-08-14 UTC" "2018-09-01 UTC" "2018-10-01 UTC" "2018-09-25 UTC"
 [76] "2018-09-28 UTC" "2018-09-29 UTC" "2018-10-11 UTC" "2019-03-26 UTC" "2018-10-26 UTC"
 [81] "2018-11-21 UTC" "2018-12-02 UTC" "2018-09-08 UTC" "2019-01-08 UTC" "2018-11-07 UTC"
 [86] "2019-02-05 UTC" "2019-01-21 UTC" "2018-09-11 UTC" "2018-12-17 UTC" "2019-01-15 UTC"
 [91] "2018-08-28 UTC" "2019-01-08 UTC" "2019-05-14 UTC" "2019-01-21 UTC" "2018-11-12 UTC"
 [96] "2018-10-26 UTC" "2019-12-26 UTC" "2020-01-03 UTC" "2020-01-06 UTC" "2020-02-26 UTC"
[101] "2020-02-14 UTC" "2020-01-27 UTC" "2020-01-21 UTC" "2020-03-16 UTC" "2020-02-26 UTC"
[106] "2019-12-31 UTC"

data_sensor[[1]]
                   date Temperature
1   2018-08-07 11:00:31         6.3
2   2018-08-07 11:10:31        11.4
3   2018-08-07 11:20:31        12.0
4   2018-08-07 11:30:31        13.7
5   2018-08-07 11:40:31        15.6
6   2018-08-07 11:50:31        13.6
7   2018-08-07 12:00:31        12.2
8   2018-08-07 12:10:31        11.2
9   2018-08-07 12:20:31        11.6
...............................
...............................
...............................
499 2018-08-10 22:00:31         9.7
500 2018-08-10 22:10:31         9.6
 [ reached 'max' / getOption("max.print") -- omitted 8592 rows ]
Cleaning the data with these nested for loops takes several minutes. How can I improve the performance of this code?

Error when implementing the answer:

    > data_sensor = 
+   tibble(
+     file = paste("file",1:length(date_admin)),
+     date_admin = date_admin
+   ) %>% 
+   mutate(data_sensor = map(file, ~data_sensor))

> data_sensor
# A tibble: 106 x 3
   file    date_admin          data_sensor 
   <chr>   <dttm>              <list>      
 1 file 1  2018-10-07 00:00:00 <list [106]>
 2 file 2  2018-12-29 00:00:00 <list [106]>
 3 file 3  2018-12-13 00:00:00 <list [106]>

Before running this code the class of my data_sensor was list; afterwards it becomes:

[1] "tbl_df" "tbl" "data.frame"

The error appears in this block:

> data_sensor = data_sensor %>% 
+   group_by(file) %>% 
+   group_modify(~f(.x))
 Error in UseMethod("mutate") : 
  no applicable method for 'mutate' applied to an object of class "list" 
> class(data_sensor)
[1] "tbl_df"     "tbl"        "data.frame"
> data_sensor = data_sensor %>% 
+   group_by(file) %>% 
+   group_modify(~f(.x))
 Error in UseMethod("mutate") : 
  no applicable method for 'mutate' applied to an object of class "list"
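
The <list [106]> entries above suggest that every row received the entire original list rather than a single tibble, which is why mutate() inside f ends up being applied to a plain list. Below is a minimal sketch of building the nested tibble so that each row holds exactly one tibble (sensor_list is a hypothetical name for the original list of 106 tibbles read with lapply):

# Sketch only: one tibble per row instead of the whole list in every row.
# `sensor_list` stands for the original list of 106 tibbles (lapply(files, read_xls)).
data_sensor <- tibble(
  file = paste("file", seq_along(sensor_list)),
  date_admin = date_admin,
  data_sensor = sensor_list   # list column: one (date, Temperature) tibble per row
)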

Recommended answer

Definitely do not loop!! There are far more efficient ways to do this kind of operation, and I'll show you how. But first I need to generate some data. For that I wrote two small helper functions: rndDate draws a random start date between 2018-01-01 and 2020-12-31, and fDateSensor returns a time series with observations every 10 minutes.

library(lubridate)  # for ymd() and dminutes()

# Random start date between 2018-01-01 and 2020-12-31
rndDate = function(start_date = ymd("20180101"), end_date = ymd("20201231")) {
  sample(seq(start_date, end_date, "days"), 1)
}

# Time series of n observations at 10-minute intervals from a random start date
fDateSensor = function(n) tibble(
    date = rndDate() + 1:n * dminutes(10),
    Temperature = rnorm(n)
  )

fDateSensor(5)

Output

# A tibble: 5 x 2
  date                Temperature
  <dttm>                    <dbl>
1 2019-09-27 00:10:00      -0.511
2 2019-09-27 00:20:00       0.531
3 2019-09-27 00:30:00       1.42 
4 2019-09-27 00:40:00       0.252
5 2019-09-27 00:50:00      -0.570

Now I'll build a tibble of nested tibbles. First, for just two dates in date_admin:

nDateSensor = 10
set.seed(1234)
date_admin = c("2018-10-07", "2019-07-29")
data_sensor = 
  tibble(
    file = paste("file",1:length(date_admin)),
    date_admin = date_admin
  ) %>% 
  mutate(data_sensor = map(file, ~fDateSensor(nDateSensor)))
data_sensor

Output

# A tibble: 2 x 3
  file   date_admin data_sensor      
  <chr>  <chr>      <list>           
1 file 1 2018-10-07 <tibble [10 x 2]>
2 file 2 2019-07-29 <tibble [10 x 2]>

As you can see, I've simulated reading two files. Their contents sit in the data_sensor list column as 10 x 2 tibbles.

data_sensor$data_sensor
[[1]]
# A tibble: 10 x 2
   date                Temperature
   <dttm>                    <dbl>
 1 2020-10-14 00:10:00      0.314 
 2 2020-10-14 00:20:00      0.359 
 3 2020-10-14 00:30:00     -0.730 
 4 2020-10-14 00:40:00      0.0357
 5 2020-10-14 00:50:00      0.113 
 6 2020-10-14 01:00:00      1.43  
 7 2020-10-14 01:10:00      0.983 
 8 2020-10-14 01:20:00     -0.622 
 9 2020-10-14 01:30:00     -0.732 
10 2020-10-14 01:40:00     -0.517 

[[2]]
# A tibble: 10 x 2
   date                Temperature
   <dttm>                    <dbl>
 1 2019-07-28 00:10:00     -0.776 
 2 2019-07-28 00:20:00      0.0645
 3 2019-07-28 00:30:00      0.959 
 4 2019-07-28 00:40:00     -0.110 
 5 2019-07-28 00:50:00     -0.511 
 6 2019-07-28 01:00:00     -0.911 
 7 2019-07-28 01:10:00     -0.837 
 8 2019-07-28 01:20:00      2.42  
 9 2019-07-28 01:30:00      0.134 
10 2019-07-28 01:40:00     -0.491 

Now for the most important part: we build a function f that modifies the inner tibbles according to what you want.

f = function(data) {
  # Set date and Temperature to NA wherever date falls before date_admin
  data$data_sensor[[1]] = data$data_sensor[[1]] %>% mutate(
    date = ifelse(date < data$date_admin, NA, date) %>% as_datetime(),
    Temperature = ifelse(date < data$date_admin, NA, Temperature)
  )
  # Also report how many observations were turned into NA
  data %>% mutate(nNA = sum(is.na(data$data_sensor[[1]]$date)))
}

data_sensor = data_sensor %>% 
  group_by(file) %>% 
  group_modify(~f(.x))

data_sensor$data_sensor

Output

[[1]]
# A tibble: 10 x 2
   date                Temperature
   <dttm>                    <dbl>
 1 2020-10-14 00:10:00      0.314 
 2 2020-10-14 00:20:00      0.359 
 3 2020-10-14 00:30:00     -0.730 
 4 2020-10-14 00:40:00      0.0357
 5 2020-10-14 00:50:00      0.113 
 6 2020-10-14 01:00:00      1.43  
 7 2020-10-14 01:10:00      0.983 
 8 2020-10-14 01:20:00     -0.622 
 9 2020-10-14 01:30:00     -0.732 
10 2020-10-14 01:40:00     -0.517 

[[2]]
# A tibble: 10 x 2
   date   Temperature
   <dttm> <lgl>      
 1 NA     NA         
 2 NA     NA         
 3 NA     NA         
 4 NA     NA         
 5 NA     NA         
 6 NA     NA         
 7 NA     NA         
 8 NA     NA         
 9 NA     NA         
10 NA     NA         

As you can see, everything works as expected. On top of mutating data_sensor, our f function also returns the number of NA observations (nNA).

# A tibble: 2 x 4
# Groups:   file [2]
  file   date_admin data_sensor         nNA
  <chr>  <chr>      <list>            <int>
1 file 1 2018-10-07 <tibble [10 x 2]>     0
2 file 2 2019-07-29 <tibble [10 x 2]>    10
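
Since the original loop ended with drop_na(), the rows that f turned into NA can then be dropped from each nested tibble; a minimal sketch, assuming the nested layout above:

# Sketch only: remove the NA rows from every inner tibble, mirroring the
# drop_na() step of the original loop.
data_sensor <- data_sensor %>%
  mutate(data_sensor = map(data_sensor, tidyr::drop_na))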

So now it's time to test it on larger data. Here I use your date_admin vector and generate time series of 100,000 observations each!

date_admin = c(
  "2018-10-07", "2018-12-29", "2018-12-13", "2019-08-09", "2019-10-10",
  "2019-04-26", "2018-11-21", "2018-08-23", "2019-07-08", "2019-11-19",
  "2019-11-07", "2018-09-05", "2018-09-03", "2018-09-24", "2018-10-11",
  "2018-09-25", "2019-03-29", "2018-08-20", "2018-09-17", "2019-03-30",
  "2018-11-07", "2019-01-01", "2018-08-31", "2019-03-27", "2019-11-10",
  "2019-04-04", "2019-10-18", "2018-09-06", "2018-09-23", "2018-09-22",
  "2019-07-22", "2018-09-04", "2019-05-17", "2018-11-05", "2018-12-09",
  "2018-09-03", "2019-05-21", "2019-02-22", "2018-08-30", "2019-06-04",
  "2018-09-13", "2018-10-14", "2019-11-08", "2018-08-30", "2019-04-12",
  "2018-09-24", "2018-08-22", "2018-08-30", "2018-09-07", "2018-11-11",
  "2018-11-01", "2018-10-01", "2018-10-22", "2018-12-03", "2019-06-06",
  "2018-09-09", "2018-09-10", "2018-09-24", "2018-10-11", "2018-11-30",
  "2018-09-20", "2019-11-20", "2018-10-11", "2018-10-09", "2018-09-27",
  "2019-11-11", "2018-10-04", "2018-09-14", "2019-04-27", "2018-09-04",
  "2018-09-11", "2018-08-14", "2018-09-01", "2018-10-01", "2018-09-25",
  "2018-09-28", "2018-09-29", "2018-10-11", "2019-03-26", "2018-10-26",
  "2018-11-21", "2018-12-02", "2018-09-08", "2019-01-08", "2018-11-07",
  "2019-02-05", "2019-01-21", "2018-09-11", "2018-12-17", "2019-01-15",
  "2018-08-28", "2019-01-08", "2019-05-14", "2019-01-21", "2018-11-12",
  "2018-10-26", "2019-12-26", "2020-01-03", "2020-01-06", "2020-02-26",
  "2020-02-14", "2020-01-27", "2020-01-21", "2020-03-16", "2020-02-26",
  "2019-12-31")

nDateSensor = 100000
set.seed(1234)

data_sensor = 
  tibble(
    file = paste("file",1:length(date_admin)),
    date_admin = date_admin
  ) %>% 
  mutate(data_sensor = map(file, ~fDateSensor(nDateSensor)))

Output

 data_sensor
# A tibble: 106 x 3
   file    date_admin data_sensor           
   <chr>   <chr>      <list>                
 1 file 1  2018-10-07 <tibble [100,000 x 2]>
 2 file 2  2018-12-29 <tibble [100,000 x 2]>
 3 file 3  2018-12-13 <tibble [100,000 x 2]>
 4 file 4  2019-08-09 <tibble [100,000 x 2]>
 5 file 5  2019-10-10 <tibble [100,000 x 2]>
 6 file 6  2019-04-26 <tibble [100,000 x 2]>
 7 file 7  2018-11-21 <tibble [100,000 x 2]>
 8 file 8  2018-08-23 <tibble [100,000 x 2]>
 9 file 9  2019-07-08 <tibble [100,000 x 2]>
10 file 10 2019-11-19 <tibble [100,000 x 2]>
# ... with 96 more rows

Time for the mutation. We'll measure how long it takes while we're at it.

start_time = Sys.time()
data_sensor = data_sensor %>% 
  group_by(file) %>% 
  group_modify(~f(.x))
Sys.time() - start_time

For me this took 2.3 seconds. I don't know what timing you were expecting, but that seems like a pretty good result. Let's see what data_sensor looks like now.

# A tibble: 106 x 4
# Groups:   file [106]
   file     date_admin data_sensor              nNA
   <chr>    <chr>      <list>                 <int>
 1 file 1   2018-10-07 <tibble [100,000 x 2]>     0
 2 file 10  2019-11-19 <tibble [100,000 x 2]> 19001
 3 file 100 2020-02-26 <tibble [100,000 x 2]> 95897
 4 file 101 2020-02-14 <tibble [100,000 x 2]>  7769
 5 file 102 2020-01-27 <tibble [100,000 x 2]> 99497
 6 file 103 2020-01-21 <tibble [100,000 x 2]>     0
 7 file 104 2020-03-16 <tibble [100,000 x 2]> 50969
 8 file 105 2020-02-26 <tibble [100,000 x 2]>     0
 9 file 106 2019-12-31 <tibble [100,000 x 2]> 13673
10 file 11  2019-11-07 <tibble [100,000 x 2]> 16697
# ... with 96 more rows

As you can see, some of the data has been changed to NA, so everything went smoothly. All you have to do is read your xls file names into data_sensor and then use group_by(file) and group_modify to load the data into the data_sensor variable. Good luck!
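
A minimal sketch of that last step, wiring the real xls files into this layout (it reuses files and date_admin from the question and assumes they line up one file per date):

# Sketch only: build the nested tibble straight from the xls files, then apply f.
library(readxl)
data_sensor <- tibble(
  file = basename(files),                 # one row per file
  date_admin = date_admin,                # matching admin date for each file
  data_sensor = map(files, read_xls)      # list column of (date, Temperature) tibbles
) %>%
  group_by(file) %>%
  group_modify(~f(.x))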
