-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdivide_function_name.py
More file actions
161 lines (131 loc) · 4.41 KB
/
divide_function_name.py
File metadata and controls
161 lines (131 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import json
import argparse
import sentencepiece as spm
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
# Import-time setup: every statement below runs as soon as this module is loaded.
# Fetch the NLTK resources requested by name: the perceptron POS tagger and WordNet.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Shared SentencePiece processor; the model path is relative, so it is resolved
# against the current working directory of the process.
sp = spm.SentencePieceProcessor()
sp.load('evaluation/segmentation_model/segmentation.model')
# Shared WordNet lemmatizer instance used by func_name_preprocessing.
lem = WordNetLemmatizer()
def get_pos(treebank_tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag yields None so
    that callers can test the result with a plain ``is None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Resolve the constant only once a prefix has matched.
            return getattr(wordnet, attr_name)
    return None
def func_name_segmentation(word):
    """
    Segment concatenated words into individual words.

    Encodes ``word`` with the module-level SentencePiece processor ``sp`` and
    returns the resulting pieces as a list of strings. The word-boundary
    marker (U+2581) is removed from the first piece, which may leave that
    piece empty. An empty list is returned when the model produces no pieces
    (for example for an empty ``word``) instead of raising IndexError.
    """
    pieces = sp.encode_as_pieces(word)
    if not pieces:
        return pieces
    # Only strip the leading character when it really is the boundary marker,
    # so a model that emits no such prefix does not lose a real letter.
    if pieces[0].startswith('\u2581'):
        pieces[0] = pieces[0][1:]
    return pieces
def _split_name_tokens(func_name):
    """Split a raw function name into non-empty, lowercase alphabetic tokens."""
    # NOTE(review): the name is lowercased before splitting, so camelCase
    # boundaries are NOT split here; only the segmentation model applied
    # later can separate such words.
    lowered = func_name.lower()
    # Every non-letter (underscore, digit, punctuation) becomes one space.
    spaced = ''.join(c if c.isalpha() else ' ' for c in lowered)

    tokens = []
    single_letters = []
    # split(' ') (not split()) is deliberate: doubled separators produce empty
    # tokens, and those terminate a run of single-letter tokens.
    for tok in spaced.strip().split(' '):
        if len(tok) == 1:
            # Consecutive one-letter tokens are glued together, e.g. "s h a" -> "sha".
            single_letters.append(tok)
            continue
        if single_letters:
            tokens.append(''.join(single_letters))
            single_letters = []
        if tok:
            tokens.append(tok)
    if single_letters:
        tokens.append(''.join(single_letters))
    return tokens


def func_name_preprocessing(func_name):
    """
    Preprocess function name by:
    - tokenize whole name into words
    - remove digits
    - segment concatenated words
    - lemmatize words

    Returns the resulting words joined by single spaces, in lowercase, or
    None when nothing is left after preprocessing (e.g. the name contains
    no letters).
    """
    words = _split_name_tokens(func_name)
    if not words:
        # Nothing to tag or segment; avoids calling the tagger on an empty list.
        return None

    # lemmatize words
    lemmas = []
    for word, tag in nltk.pos_tag(words):
        wntag = get_pos(tag)
        if wntag is None:  # not supply tag in case of None
            lemmas.append(lem.lemmatize(word))
        else:
            lemmas.append(lem.lemmatize(word, pos=wntag))

    # segment concatenated words
    final_words = []
    for lemma in lemmas:
        if not lemma:
            continue
        # Segmentation can yield empty pieces (e.g. a bare boundary marker); drop them.
        final_words.extend(piece for piece in func_name_segmentation(lemma) if piece)

    if not final_words:
        return None
    return ' '.join(final_words).lower()
def main(args):
    """
    Normalize ground-truth and predicted function names and append them to a text file.

    ``args.input_file`` is a JSON file holding an iterable of records; each
    record is read through its 'ground_truth' and 'predicted_name' keys.
    ``args.output_dir`` is created if missing, and one line per record,
    formatted as ``<ground truth>, <prediction>,``, is appended to
    'processed_predicted_function_name.txt' inside it. Records whose ground
    truth preprocesses to None are skipped; a prediction that preprocesses
    to None is written as a single space.
    """
    input_file = args.input_file
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as f:
        results = json.load(f)

    output_path = os.path.join(output_dir, 'processed_predicted_function_name.txt')
    # Open the output once (append mode, as before) and let the context
    # manager close it even if a record raises part-way through.
    with open(output_path, 'a', encoding='utf-8') as out:
        for result in results:
            # Only the last space-separated token of the ground truth is the name.
            ground_truth = func_name_preprocessing(result['ground_truth'].split(' ')[-1].strip())
            if ground_truth is None:
                continue

            # Keep only the text before the end-of-sequence marker.
            prediction = result['predicted_name'].split('</s>')[0]
            # If the answer is quoted, take the first quoted / back-ticked span.
            if '"' in prediction:
                prediction = prediction.split('"')[1]
            if '`' in prediction:
                prediction = prediction.split('`')[1]
            # Drop anything after the first '.', then keep the last word.
            prediction = prediction.split('.')[0].strip()
            prediction = prediction.split(' ')[-1].strip()

            prediction = func_name_preprocessing(prediction)
            if prediction is None:
                prediction = ' '
            out.write(ground_truth + ', ' + prediction + ',\n')
if __name__ == '__main__':
    # Command-line entry point: both the input file and the output directory are mandatory.
    cli = argparse.ArgumentParser(
        description='Preprocess the predicted function name for evaluation.')
    cli.add_argument(
        '-i', '--input_file',
        type=str,
        required=True,
        help='Path to the input file containing predicted function names and ground truth.')
    cli.add_argument(
        '-o', '--output_dir',
        type=str,
        required=True,
        help='Directory to save the evaluation results.')
    main(cli.parse_args())