下面是我的一个大文件的前 10 行(请注意:这 10 行中只包含一个用户,但完整数据中有数千个用户):

dput(testd) 
structure(list(user = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L 
), otime = structure(c(10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L 
), .Label = c("2010-10-12T19:56:49Z", "2010-10-13T03:57:23Z",  
"2010-10-13T16:41:35Z", "2010-10-13T20:05:43Z", "2010-10-13T23:31:51Z",  
"2010-10-14T00:21:47Z", "2010-10-14T18:25:51Z", "2010-10-16T03:48:54Z",  
"2010-10-16T06:02:04Z", "2010-10-17T01:48:53Z"), class = "factor"),  
    lat = c(39.747652, 39.891383, 39.891077, 39.750469, 39.752713,  
      39.752508, 39.7513, 39.758974, 39.827022, 39.749934), 
    long = c(-104.99251, -105.070814, -105.068532, -104.999073,  
      -104.996337, -104.996637, -105.000121, -105.010853, 
      -105.143191, -105.000017), 
    locid = structure(c(5L, 4L, 9L, 6L, 1L, 2L, 8L, 3L, 10L, 7L), 
      .Label = c("2ef143e12038c870038df53e0478cefc",  
      "424eb3dd143292f9e013efa00486c907", "6f5b96170b7744af3c7577fa35ed0b8f",  
      "7a0f88982aa015062b95e3b4843f9ca2", "88c46bf20db295831bd2d1718ad7e6f5",  
      "9848afcc62e500a01cf6fbf24b797732f8963683", "b3d356765cc8a4aa7ac5cd18caafd393",  
      "d268093afe06bd7d37d91c4d436e0c40d217b20a", "dd7cd3d264c2d063832db506fba8bf79",  
      "f6f52a75fd80e27e3770cd3a87054f27"), class = "factor"), 
    dnt = structure(c(10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), 
      .Label = c("2010-10-12 19:56:49",  
      "2010-10-13 03:57:23", "2010-10-13 16:41:35", "2010-10-13 20:05:43",  
      "2010-10-13 23:31:51", "2010-10-14 00:21:47", "2010-10-14 18:25:51",  
      "2010-10-16 03:48:54", "2010-10-16 06:02:04", "2010-10-17 01:48:53" 
    ), class = "factor"), 
    x = c(-11674.6344476781, -11683.3414552141,  
      -11683.0877083915, -11675.3642199817, -11675.0599906624,  
      -11675.0933491404, -11675.4807522648, -11676.6740962175,  
      -11691.3894104198, -11675.4691879924), 
    y = c(4419.73724843345, 4435.719406435, 4435.68538078744, 
      4420.05048454181, 4420.3000059572, 4420.27721099723, 
      4420.14288752585, 4420.99619739292, 4428.56278976123,  
      4419.99099525605), 
    cellx = structure(c(1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), 
      .Label = c("[-11682,-11672)", "[-11692,-11682)" 
    ), class = "factor"), 
    celly = structure(c(1L, 2L, 2L, 1L,  
      1L, 1L, 1L, 1L, 1L, 1L), .Label = c("[4419,4429)", "[4429,4439)" 
    ), class = "factor"), 
    cellxy = structure(c(1L, 3L, 3L, 1L,  
      1L, 1L, 1L, 1L, 2L, 1L), .Label = c("[-11682,-11672)[4419,4429)",  
      "[-11692,-11682)[4419,4429)", "[-11692,-11682)[4429,4439)" 
    ), class = "factor")), .Names = c("user", "otime", "lat",  
"long", "locid", "dnt", "x", "y", "cellx", "celly", "cellxy"), class = "data.frame", row.names = c(NA,  
-10L)) 

为便于理解,先对数据做一些说明:x 和 y 是由 lat 和 long 坐标变换得到的。我已使用 cut 将 x、y 位置离散化为 bin。我想获得每个用户访问次数最多的 bin,所以使用了 ddply,如下:
# Count the check-ins in each (user, cellxy) group. The summary column must be
# named explicitly: an unnamed expression is not labelled `count` by plyr, yet
# the result printed for this call has a `count` column.
cells <- ddply(testd, .(user, cellxy), summarise, count = length(cellxy))

获得:
dput(cells) 
structure(list(user = c(0, 0, 0), cellxy = structure(1:3, .Label = c("[-11682,-11672)[4419,4429)",  
"[-11692,-11682)[4419,4429)", "[-11692,-11682)[4429,4439)"), class = "factor"),  
    count = c(7L, 1L, 2L)), .Names = c("user", "cellxy", "count" 
), row.names = c(NA, -3L), class = "data.frame") 

现在我想做的是:针对每个用户访问次数最多的 bin(即由前面的计算得到的 bin),从第一个数据集中计算其 x、y 的平均值。我不知道如何高效地做到这一点,而且我的数据集非常大,希望能得到一些指导。谢谢!

请您参考如下方法:

这是一个两阶段的方法。首先,修改你原来计算 cells 的代码:对 user 和 cellxy 的每个组合,同时计算 x 和 y 的平均值。

 # For every (user, cellxy) group: the number of rows in the group and the
 # mean x / mean y of those rows.
 cells <- ddply(
   testd, .(user, cellxy), summarise,
   cellcount = length(cellxy),
   meanx = mean(x),
   meany = mean(y)
 )
 cells
  user                     cellxy cellcount     meanx    meany 
1    0 [-11682,-11672)[4419,4429)         7 -11675.40 4420.214 
2    0 [-11692,-11682)[4419,4429)         1 -11691.39 4428.563 
3    0 [-11692,-11682)[4429,4439)         2 -11683.21 4435.702 

然后再调用一次 ddply(),为每个用户筛选出 cellcount 最高的 cellxy。
# Within each user, keep only the row(s) whose cellcount equals that user's
# maximum cellcount; if several cells tie for the maximum, all are kept.
cells2 <- ddply(cells, .(user), subset, cellcount == max(cellcount))
cells2
  user                     cellxy cellcount    meanx    meany 
1    0 [-11682,-11672)[4419,4429)         7 -11675.4 4420.214 


评论关闭
IT干货网

微信公众号:IT虾米 (左侧二维码扫一扫)欢迎添加!