def Newdf data = dfcopydeep=True dataseq_min=0 dataseq_max=0 dataA = 0 dataB = 0 dataC = 0 dataD = 0 dataE = 0 dataF = 0 dataG = 0 dataH = 0 st = timeperf_counter
可以考虑使用向量化操作,避免逐行迭代 DataFrame,从而大幅提高效率。
首先,可以使用 groupby 操作,将数据按照 Index 分组。然后,可以使用 apply 方法,对每个分组进行操作。在操作过程中,可以使用 nsmallest 和 nlargest 方法获取第二小和第二大的值,使用 merge 方法将这些值与分组合并。最后,可以使用 loc 方法将计算结果赋值给 data 的相应列。
以下是可能的优化代码:
def New(df):
    """Return a copy of ``df`` annotated with per-``Index`` sequence columns.

    The input must provide the columns ``Index``, ``Via_Seq``,
    ``Arrive_TmS`` and ``Deptr_TmS``.  The input frame is not modified.

    Columns added to the returned frame:

    * ``seq_min`` / ``seq_max`` -- smallest / largest ``Via_Seq`` of the
      row's ``Index`` group.
    * ``A`` -- minimum ``Arrive_TmS`` among the group's rows whose
      ``Via_Seq`` equals ``seq_min`` (same value on every row of the group).
    * ``H`` -- maximum ``Deptr_TmS`` among the group's rows whose
      ``Via_Seq`` equals ``seq_max`` (same value on every row of the group).
    * ``second_smallest`` / ``second_largest`` -- second entry of the
      group's ``Via_Seq`` values in ascending / descending order; missing
      for groups with fewer than two non-null ``Via_Seq`` values.
    * ``C`` / ``D`` -- the row's own ``Arrive_TmS`` / ``Deptr_TmS`` where
      ``Via_Seq == second_smallest``, missing on every other row.
    * ``E`` / ``F`` -- the same for ``Via_Seq == second_largest``.
    * ``G`` -- the row's own ``Arrive_TmS`` where ``Via_Seq == seq_max``,
      missing on every other row.
    * ``firstFlag`` / ``lastFlag`` -- result of ``Noequ(Via_Seq, seq_min)``
      / ``Noequ(Via_Seq, seq_max)`` evaluated per row.

    NOTE(review): no ``B`` column is produced here; confirm whether callers
    expect one.

    The merges reset the row index of the result to a fresh ``RangeIndex``.
    """
    data = df.copy(deep=True)

    seq_by_index = data.groupby('Index')['Via_Seq']
    data["seq_min"] = seq_by_index.transform('min')
    data["seq_max"] = seq_by_index.transform('max')

    # One row per (Index, Via_Seq).  Renaming Via_Seq to seq_min / seq_max
    # lets the merges below pick exactly the first / last sequence entry.
    group_min = (
        data.groupby(['Index', 'Via_Seq'])['Arrive_TmS']
        .min()
        .reset_index()
        .rename(columns={'Arrive_TmS': 'A', 'Via_Seq': 'seq_min'})
    )
    group_max = (
        data.groupby(['Index', 'Via_Seq'])['Deptr_TmS']
        .max()
        .reset_index()
        .rename(columns={'Deptr_TmS': 'H', 'Via_Seq': 'seq_max'})
    )

    # Rank every Via_Seq inside its Index group.  The previous version
    # filtered ``nsmallest(2).reset_index()`` on ``level_1 == 1``, but
    # ``level_1`` is the original row label rather than the position inside
    # the group, so the second value was found for at most one row overall.
    ordered = (
        data[['Index', 'Via_Seq']]
        .dropna(subset=['Index', 'Via_Seq'])
        .sort_values(['Index', 'Via_Seq'], kind='mergesort')
    )
    ordered_groups = ordered.groupby('Index')
    rank_from_min = ordered_groups.cumcount()
    rank_from_max = ordered_groups.cumcount(ascending=False)

    group_second_smallest = (
        ordered.loc[rank_from_min == 1, ['Index', 'Via_Seq']]
        .rename(columns={'Via_Seq': 'second_smallest'})
    )
    group_second_largest = (
        ordered.loc[rank_from_max == 1, ['Index', 'Via_Seq']]
        .rename(columns={'Via_Seq': 'second_largest'})
    )

    data = data.merge(group_min, on=['Index', 'seq_min'], how='left')
    data = data.merge(group_max, on=['Index', 'seq_max'], how='left')
    data = data.merge(group_second_smallest, on='Index', how='left')
    data = data.merge(group_second_largest, on='Index', how='left')

    # ``where`` keeps the row's own value where the mask holds and leaves
    # it missing elsewhere, matching assignment from a ``.loc`` selection.
    is_second_smallest = data['Via_Seq'] == data['second_smallest']
    is_second_largest = data['Via_Seq'] == data['second_largest']
    is_last = data['Via_Seq'] == data['seq_max']

    data["C"] = data['Arrive_TmS'].where(is_second_smallest)
    data["D"] = data['Deptr_TmS'].where(is_second_smallest)
    data["E"] = data['Arrive_TmS'].where(is_second_largest)
    data["F"] = data['Deptr_TmS'].where(is_second_largest)
    data["G"] = data['Arrive_TmS'].where(is_last)

    # Noequ is defined elsewhere in the project; it is still called once
    # per row because its semantics are not visible from here.
    data['firstFlag'] = data.apply(lambda x: Noequ(x['Via_Seq'], x['seq_min']), axis=1)
    data['lastFlag'] = data.apply(lambda x: Noequ(x['Via_Seq'], x['seq_max']), axis=1)
    return data
这个优化代码中分组、排序与合并步骤的时间复杂度约为 $O(n\log n)$,比原来逐行迭代的实现高效得多,应该可以大幅提高程序的效率。需要注意的是,firstFlag 和 lastFlag 两列仍然通过逐行 apply 调用 Noequ 来计算;如果 Noequ 可以改写为向量化比较,还能进一步提速。
原文地址: http://www.cveoy.top/t/topic/cT35 著作权归作者所有。请勿转载和采集!