CodeBERT-Based Automated Patch Generation for Java Defects
```python
import json
import os
import re
import subprocess
import time
import torch
import argparse
from transformers import RobertaTokenizer, RobertaForMaskedLM
from simple_template import generate_template, remove_redudant, generate_match_template, match_simple_operator
from tool.logger import Logger
from tool.fault_localization import get_location
from tool.d4j import build_d4j1_2
from validate_patches import GVpatches, UNIAPRpatches
from bert_beam_search import BeamSearch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'


def comment_remover(text):
    """Remove comments from a source string.

    The regular expression matches three kinds of spans: // single-line
    comments, /* ... */ multi-line comments, and string literals in
    '...' or "..." form. Matched comments are replaced with a single
    space; string literals are returned unchanged so that comment-like
    text inside them is preserved.
    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return ' '  # note: a space and not an empty string
        else:
            return s

    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    return re.sub(pattern, replacer, text)
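
# Illustrative behavior of comment_remover (these example calls and
# outputs are not part of the original script; they show what the
# regex above yields):
#   comment_remover('int x = 0; // init')  ->  'int x = 0;  '
#   comment_remover('s = "// kept";')      ->  's = "// kept";'
# The second string survives untouched because the string-literal
# alternatives match first at that position and replacer returns
# literals unchanged.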


def add_new_line(file, line_loc, tokenizer, model, beam_width, re_rank=True, top_n_patches=-1):
    """Insert a new line of code before the given line of a file and
    return the list of candidate patches.

    Inputs: the file name (file), the line number at which to insert
    (line_loc), the tokenizer and pretrained model used to encode the
    context and score candidates, the beam width (beam_width), whether
    to re-rank candidates (re_rank), and the maximum number of patches
    to return (top_n_patches, -1 for all).
    """
    # Read the file and split it into the code before the insertion
    # point (pre_code), the code from the insertion point onward
    # (post_code), and the line at the location itself (old_code).
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        data = f.readlines()
    pre_code = data[:line_loc]
    post_code = data[line_loc:]
    old_code = data[line_loc].strip()

    # Build a placeholder of 20 mask tokens for the new line, then
    # shrink the context window (line_size lines on each side) until
    # the encoded input (pre_code_input + masked_line + post_code_input)
    # is shorter than 490 tokens.
    mask_token = '<mask>'
    masked_line = ' ' + mask_token * 20 + ' '
    line_size = 100
    while True:
        # Last line_size lines before the insertion point, prefixed
        # with '</s>' and joined with spaces.
        pre_code_input = '</s> ' + ' '.join(
            [x.strip() for x in pre_code[-line_size:]])
        # First line_size lines after the insertion point, with
        # newlines removed and joined with spaces.
        post_code_input = ' '.join([x.strip() for x in post_code[0:line_size]]).replace('\n', '').strip()
        # Stop shrinking once the encoded sequence fits.
        if tokenizer(pre_code_input + masked_line + post_code_input, return_tensors='pt')['input_ids'].size()[1] < 490:
            break
        line_size -= 1

    print('>>>>> Begin Some Very Long Beam Generation <<<<<')
    print('Context Line Size: {}'.format(line_size))  # actual context len = 2*line_size
    print('Context Before:\n{}'.format(pre_code_input))
    print('Context After:\n{}'.format(post_code_input))

    # Use beam search to find the code fragments most likely to fill
    # the masked slot. For each mask length from 1 to 29 tokens, a
    # BeamSearch instance scores candidates with the pretrained model;
    # each candidate is stored as (code, length-normalized score,
    # template description).
    ret_before = []
    # Straight up line replacement
    for token_len in range(1, 30):
        masked_line = ' ' + mask_token * token_len + ' '
        beam_engine = BeamSearch(model, tokenizer, pre_code_input + masked_line + post_code_input, device,
                                 beam_width=beam_width, re_rank=re_rank)
        beam_list, masked_index = beam_engine.generate_beam()
        for beam in beam_list:
            ret_before.append((''.join(beam[2]), beam[0] / token_len, 'Before ' + masked_line))
    ret_before.sort(key=lambda x: x[1], reverse=True)
    # Drop duplicate candidates.
    ret_before = remove_redudant(ret_before)

    # Return the code before the insertion point, the original line,
    # the sorted candidate patches, and the code after the insertion
    # point; keep only the top N candidates if requested.
    ret = []
    ret.extend(ret_before)
    ret.sort(key=lambda x: x[1], reverse=True)

    if top_n_patches == -1:
        return pre_code, old_code, ret, post_code
    else:
        return pre_code, old_code, ret[:top_n_patches], post_code
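
# For orientation (the values below are illustrative, not output of a
# real run): each entry in the candidate list returned above has the
# shape
#   ('return null ;', -0.37, 'Before  <mask><mask>... ')
# i.e. the generated code, its length-normalized beam score, and the
# masked template that produced it.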


def process_file(file, line_loc, tokenizer, model, beam_width, re_rank=True, top_n_patches=-1):
    # Read the file and split it into the code before the faulty line
    # (pre_code), the faulty line itself (fault_line, with comments
    # stripped; old_code keeps the raw text), and the code after it
    # (post_code).
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        data = f.readlines()
    ret = []
    mask_token = '<mask>'
    pre_code = data[:line_loc]
    fault_line = comment_remover(data[line_loc].strip())  # remove comments
    old_code = data[line_loc].strip()
    post_code = data[line_loc + 1:]

    # Shrink the context window until the encoded input fits within
    # 490 tokens, as in add_new_line.
    line_size = 100
    while True:
        pre_code_input = '</s> ' + ' '.join([x.strip() for x in pre_code[-line_size:]])
        post_code_input = ' '.join([x.strip() for x in post_code[0:line_size]]).replace('\n', '').strip()
        if tokenizer(pre_code_input + fault_line + post_code_input, return_tensors='pt')['input_ids'].size()[1] < 490:
            break
        line_size -= 1

    print('>>>>> Begin Some Very Long Beam Generation <<<<<')
    print('Context Line Size: {}'.format(line_size))  # actual context len = 2*line_size
    print('Context Before:\n{}'.format(pre_code_input))
    print('>> {} <<'.format(fault_line))
    print('Context After:\n{}'.format(post_code_input))

    # Number of tokens in the faulty line, excluding the two special
    # tokens the tokenizer adds.
    fault_line_token_size = tokenizer(fault_line, return_tensors='pt')['input_ids'].shape[1] - 2

    # Straight up line replacement: try mask lengths within +/-5
    # tokens of the faulty line's own length.
    for token_len in range(fault_line_token_size - 5, fault_line_token_size + 5):  # Within 10
        if token_len <= 0:
            continue
        masked_line = ' ' + mask_token * token_len + ' '
        beam_engine = BeamSearch(model, tokenizer, pre_code_input + masked_line + post_code_input, device,
                                 beam_width=beam_width, re_rank=re_rank)
        beam_list, masked_index = beam_engine.generate_beam()
        for beam in beam_list:
            ret.append((''.join(beam[2]), beam[0] / token_len, masked_line))

    # Prefix/suffix templates: keep a fixed beginning and end of the
    # faulty line and regenerate only the middle.
    templates = generate_template(fault_line)
    for partial_beginning, partial_end in templates:
        temp_size = fault_line_token_size - (
                tokenizer(partial_beginning, return_tensors='pt')['input_ids'].shape[1] - 2) - (
                tokenizer(partial_end, return_tensors='pt')['input_ids'].shape[1] - 2)
        for token_len in range(2, 11):
            masked_line = ' ' + partial_beginning + mask_token * token_len + partial_end + ' '
            beam_engine = BeamSearch(model, tokenizer, pre_code_input + masked_line + post_code_input, device,
                                     beam_width=beam_width, re_rank=re_rank)
            beam_list, masked_index = beam_engine.generate_beam()
            for beam in beam_list:
                ret.append((partial_beginning + ''.join(beam[2]) + partial_end, beam[0] / token_len, masked_line))

    # Match templates: mask one or more spans of the faulty line that
    # match known code patterns.
    match_template = generate_match_template(fault_line, tokenizer)
    for match, length in match_template:
        for token_len in range(1, length + 5):
            if len(match.split(mask_token)) == 2:
                # Single masked span: fill it directly.
                masked_line = ' ' + match.split(mask_token)[0] + mask_token * token_len + match.split(mask_token)[1] + ' '
                beam_engine = BeamSearch(model, tokenizer, pre_code_input + masked_line + post_code_input, device,
                                         beam_width=beam_width, re_rank=re_rank)
                beam_list, masked_index = beam_engine.generate_beam()
                for beam in beam_list:
                    ret.append((match.split(mask_token)[0] + ''.join(beam[2]) + match.split(mask_token)[1],
                                beam[0] / token_len, masked_line))
            else:
                # Multiple masked spans: expand each span to token_len
                # masks, then stitch the generated pieces back between
                # the fixed fragments of the template.
                masked_line = ' '
                masked_line += (mask_token * token_len).join(match.split(mask_token)) + ' '
                beam_engine = BeamSearch(model, tokenizer, pre_code_input + masked_line + post_code_input, device,
                                         beam_width=beam_width, re_rank=re_rank)
                beam_list, masked_index = beam_engine.generate_beam()
                for beam in beam_list:
                    index = 0
                    gen_line = ''
                    for c in masked_line.split(mask_token)[:-1]:
                        gen_line += c
                        gen_line += beam[2][index]
                        index += 1
                    gen_line += masked_line.split(mask_token)[-1]
                    gen_line = gen_line[1:-1]
                    ret.append((gen_line, beam[0] / (token_len * (len(match.split(mask_token)) - 1)), masked_line))

    # Operator templates: mask simple operators (e.g. a comparison) in
    # the faulty line and let the model regenerate them.
    simple_operator_template = match_simple_operator(fault_line, tokenizer)
    for template in simple_operator_template:
        token_len = template.count('<mask>')
        masked_line = ' ' + template + ' '
        beam_engine = BeamSearch(model, tokenizer, pre_code_input + masked_line + post_code_input, device,
                                 beam_width=beam_width, re_rank=re_rank)
        beam_list, masked_index = beam_engine.generate_beam()
        for beam in beam_list:
            index = 0
            gen_line = ''
            for c in masked_line.split(mask_token)[:-1]:
                gen_line += c
                gen_line += beam[2][index]
                index += 1
            gen_line += masked_line.split(mask_token)[-1]
            gen_line = gen_line[1:-1]
            ret.append((gen_line, beam[0] / token_len, masked_line))

    ret.sort(key=lambda x: x[1], reverse=True)
    ret = remove_redudant(ret)
    if top_n_patches == -1:
        return pre_code, old_code, ret, post_code
    else:
        return pre_code, old_code, ret[:top_n_patches], post_code
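
# For intuition about the three template sources used above
# (hypothetical shapes; simple_template's real output is not shown in
# this file): for a faulty line like 'if (a == b) {',
#   generate_template could yield prefix/suffix pairs such as
#     ('if (', ') {'),
#   generate_match_template masked-span strings such as
#     'if (<mask> == b) {', and
#   match_simple_operator operator-masked variants such as
#     'if (a <mask> b) {'.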


def main(bug_ids, output_folder, skip_validation, uniapr, beam_width, re_rank, perfect, top_n_patches):
    # If no bug IDs were given, process the full Defects4J 1.2 list.
    if bug_ids[0] == 'none':
        bug_ids = build_d4j1_2()
    # Load the pretrained CodeBERT masked-language model and tokenizer.
    model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base-mlm').to(device)
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')

    for bug_id in bug_ids:
        subprocess.run('rm -rf ' + '/tmp/' + bug_id, shell=True)
        # Check out the buggy version of the project from Defects4J
        # into /tmp/<bug_id>.
        subprocess.run('defects4j checkout -p %s -v %s -w %s' % (
            bug_id.split('-')[0], bug_id.split('-')[1] + 'b', ('/tmp/' + bug_id)), shell=True)
        patch_pool_folder = 'patches-pool'
        # tool.fault_localization: get the faulty file paths and line numbers.
        location = get_location(bug_id, perfect=perfect)
        # location = get_location_tbar(bug_id)
        if perfect:
            patch_pool_folder = 'pfl-patches-pool-temp'

        # Export the triggering test cases for this bug from Defects4J.
        testmethods = os.popen('defects4j export -w %s -p tests.trigger' % ('/tmp/' + bug_id)).readlines()

        logger = Logger(output_folder + '/' + bug_id + '_result.txt')
        logger.logo(args)
        # validate_patches.py: choose the patch validator.
        if uniapr:
            validator = UNIAPRpatches(bug_id, testmethods, logger, patch_pool_folder=patch_pool_folder,
                                      skip_validation=skip_validation)
        else:
            validator = GVpatches(bug_id, testmethods, logger, patch_pool_folder=patch_pool_folder,
                                  skip_validation=skip_validation)

        for file, line_number in location:
            print('Location: {} line # {}'.format(file, line_number))
            file = '/tmp/' + bug_id + '/' + file

            start_time = time.time()
            # Generate candidate patches with process_file. With many
            # suspicious locations, use a smaller beam width so the
            # run finishes in reasonable time.
            if len(location) > 3 and perfect:
                pre_code, fault_line, changes, post_code = process_file(file, line_number, tokenizer, model, 15,
                                                                        re_rank, top_n_patches)
            else:
                pre_code, fault_line, changes, post_code = process_file(file, line_number, tokenizer, model, beam_width,
                                                                        re_rank, top_n_patches)
            end_time = time.time()
            validator.add_new_patch_generation(pre_code, fault_line, changes, post_code, file, line_number,
                                               end_time - start_time)

        validator.validate()
        subprocess.run('rm -rf ' + '/tmp/' + bug_id, shell=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Bug ID to process ('none' processes all Defects4J 1.2 bugs).
    parser.add_argument('--bug_id', type=str, default='none')
    # Whether to validate patches with UniAPR.
    parser.add_argument('--uniapr', action='store_true', default=False)
    # Output folder for the generated results.
    parser.add_argument('--output_folder', type=str, default='codebert_result')
    # Whether to skip the validation step.
    parser.add_argument('--skip_v', action='store_true', default=False)
    # Whether to re-rank generated patches.
    parser.add_argument('--re_rank', action='store_true', default=False)
    # Beam width for the beam search algorithm.
    parser.add_argument('--beam_width', type=int, default=25)
    # Whether to assume perfect fault localization.
    parser.add_argument('--perfect', action='store_true', default=False)
    # Number of patches to keep per location (-1 keeps all).
    parser.add_argument('--top_n_patches', type=int, default=-1)
    args = parser.parse_args()
    print('Run with setting:')
    print(args)
    main([args.bug_id], args.output_folder, args.skip_v, args.uniapr, args.beam_width,
         args.re_rank, args.perfect, args.top_n_patches)
```

This script utilizes the CodeBERT model to automatically generate patch suggestions for Java defects. It analyzes faulty code lines, employs beam search to explore potential fixes, and ranks the generated patches by their model-assigned likelihood of resolving the issue.

The script can be run with various command-line arguments to customize the patch generation process:

* --bug_id: The bug ID to process ('none' processes all Defects4J 1.2 bugs).
* --uniapr: Whether to use UniAPR for patch validation.
* --output_folder: The output folder for generated results.
* --skip_v: Whether to skip the validation step.
* --re_rank: Whether to use reranking to sort generated patches.
* --beam_width: The beam width for the beam search algorithm.
* --perfect: Whether to assume perfect fault localization.
* --top_n_patches: The number of patches to keep per location.

The script uses CodeBERT to analyze the context around the faulty code line and generate candidate replacements. It also applies several strategies to improve patch quality, such as removing redundant candidates and using templates derived from common code patterns.

The generated patches are then validated by running the defect's triggering test cases to check whether they resolve the issue, and the script writes a detailed log of the generation and validation process for each bug ID.

This makes the script a useful tool for developers and researchers working on automated program repair, enabling them to quickly explore and evaluate candidate patches for Java defects.
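
As an example, a run against a single Defects4J bug with perfect fault localization and re-ranking could look like the following (the script name `experiment.py` comes from the original notes; the bug ID and flag values are illustrative):

```bash
python3 experiment.py --bug_id Chart-1 --perfect --re_rank --beam_width 25 --top_n_patches 200
```

With `--bug_id none` (the default), the script instead iterates over every bug returned by `build_d4j1_2()`.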