import random
import re

import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM
# Globals filled in by MyDataset.__init__ and consumed by the evaluation loop
# below: maskis[i] holds, per 512-token chunk of example i, the positions of
# the first mask token of each masked span; n_y[i] holds the ground-truth name.
maskis = []
n_y = []

class MyDataset(Dataset):
    """Reads a CSV with columns X (code containing a [MASK] placeholder) and
    y (the masked variable name), expands [MASK] to one mask token per
    sub-token of y, and splits the encoding into padded 512-token chunks."""

    def __init__(self, file_name):
        global maskis
        global n_y
        df = pd.read_csv(file_name)
        df = df.fillna("")
        self.inp_dicts = []
        for r in range(df.shape[0]):
            X_init = df['X'][r]
            y = df['y'][r]
            n_y.append(y)
            # Split the camelCase/PascalCase identifier into words, e.g.
            # "maxScore" -> "max score", so it tokenizes like natural text.
            nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
            lb = ' '.join(nl).lower()
            x = tokenizer.tokenize(lb)
            num_sub_tokens_label = len(x)
            # Expand the single [MASK] placeholder into one mask token per
            # sub-token of the label.
            X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * num_sub_tokens_label))
            tokens = tokenizer.encode_plus(X_init, add_special_tokens=False, return_tensors='pt')
            # Chunk into pieces of 510 tokens, leaving room for CLS and SEP.
            input_id_chunks = list(tokens['input_ids'][0].split(510))
            mask_chunks = list(tokens['attention_mask'][0].split(510))
            # Wrap each chunk in CLS ... SEP. The original script hard-coded
            # BERT's ids 101/102, but GraphCodeBERT is RoBERTa-based, so the
            # tokenizer's own special-token ids are used here instead.
            cls_tok = torch.full((1,), fill_value=tokenizer.cls_token_id)
            sep_tok = torch.full((1,), fill_value=tokenizer.sep_token_id)
            one = torch.full((1,), fill_value=1)
            for c in range(len(input_id_chunks)):
                input_id_chunks[c] = torch.cat([cls_tok, input_id_chunks[c], sep_tok], dim=-1)
                mask_chunks[c] = torch.cat([one, mask_chunks[c], one], dim=-1)
            # Pad every chunk to exactly 512 tokens; padded positions get an
            # attention mask of 0.
            pad_tok = torch.full((1,), fill_value=tokenizer.pad_token_id)
            zero = torch.full((1,), fill_value=0)
            for i in range(len(input_id_chunks)):
                pad_len = 512 - input_id_chunks[i].shape[0]
                if pad_len > 0:
                    input_id_chunks[i] = torch.cat([input_id_chunks[i], pad_tok.repeat(pad_len)], dim=-1)
                    mask_chunks[i] = torch.cat([mask_chunks[i], zero.repeat(pad_len)], dim=-1)
            # Record, per chunk, the position of the first mask token of each
            # masked span (consecutive masks belong to the same identifier).
            maski = []
            for l in range(len(input_id_chunks)):
                masked_pos = []
                for i in range(len(input_id_chunks[l])):
                    if input_id_chunks[l][i] == tokenizer.mask_token_id:
                        if i != 0 and input_id_chunks[l][i - 1] == tokenizer.mask_token_id:
                            continue
                        masked_pos.append(i)
                maski.append(masked_pos)
            maskis.append(maski)
            # Pad the chunk list to a fixed 250 chunks so every example stacks
            # to the same shape; filler chunks get an all-zero attention mask
            # and are sliced off again in the evaluation loop.
            filler_ids = torch.ones_like(input_id_chunks[0])
            filler_mask = torch.zeros_like(input_id_chunks[0])
            while len(input_id_chunks) < 250:
                input_id_chunks.append(filler_ids)
                mask_chunks.append(filler_mask)
            input_ids = torch.stack(input_id_chunks)
            attention_mask = torch.stack(mask_chunks)
            self.inp_dicts.append({
                'input_ids': input_ids.long(),
                'attention_mask': attention_mask.int(),
            })

    def __len__(self):
        return len(self.inp_dicts)

    def __getitem__(self, idx):
        return self.inp_dicts[idx]
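# Illustrative check (the identifier 'maxAllowedAttempts' is just a demo
# string, not from the dataset): the camelCase-splitting regex used above
# breaks an identifier into words before it is lowercased and tokenized.
_demo_parts = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', 'maxAllowedAttempts')
assert [w.lower() for w in _demo_parts] == ['max', 'allowed', 'attempts']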
# GraphCodeBERT tokenizer plus two MLM heads: `model` loads the fine-tuned
# checkpoint, while `base_model` keeps the off-the-shelf weights as a baseline.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
base_model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
# map_location keeps loading robust if the checkpoint was saved on GPU;
# the whole script runs on CPU tensors.
model.load_state_dict(torch.load('var_runs/model_26_2', map_location='cpu'))
model.eval()
base_model.eval()
myDs = MyDataset('test.csv')
train_loader = DataLoader(myDs, batch_size=1, shuffle=False)
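# Quick sanity check (illustrative only; 'total sales amount' is a made-up
# label): the dataset expands one [MASK] placeholder into as many mask tokens
# as the label has sub-tokens, so the MLM predicts each sub-token in place.
_demo_sub = tokenizer.tokenize('total sales amount')
_demo_x = 'int [MASK] = 0 ;'.replace('[MASK]', ' '.join([tokenizer.mask_token] * len(_demo_sub)))
print('demo masked input:', _demo_x)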
# Pool of mock identifiers for the control condition, grouped roughly by length:
variable_names = [
    # short single-word names
    'count', 'value', 'result', 'flag', 'max', 'min', 'data', 'input', 'output', 'name',
    'index', 'status', 'error', 'message', 'price', 'quantity', 'total', 'length', 'size', 'score',

    # two-word camelCase names
    'studentName', 'accountBalance', 'isFound', 'maxScore', 'userAge', 'carModel', 'bookTitle',
    'arrayLength', 'employeeID', 'itemPrice', 'customerAddress', 'productCategory', 'orderNumber',
    'transactionType', 'bankAccount', 'shippingMethod', 'deliveryDate', 'purchaseAmount',
    'inventoryItem', 'salesRevenue',

    # mostly three-word names
    'numberOfStudents', 'averageTemperature', 'userIsLoggedIn', 'totalSalesAmount',
    'employeeSalaryRate', 'maxAllowedAttempts', 'selectedOption', 'shippingAddress',
    'manufacturingDate', 'connectionPool', 'customerAccountBalance', 'employeeSalaryReport',
    'productInventoryCount', 'transactionProcessingStatus', 'userAuthenticationToken',
    'orderShippingAddress', 'databaseConnectionPoolSize', 'vehicleEngineTemperature',
    'sensorDataProcessingRate', 'employeePayrollSystem',

    # four-word and longer names
    'customerAccountBalanceValue', 'employeeSalaryReportData', 'productInventoryItemCount',
    'transactionProcessingStatusFlag', 'userAuthenticationTokenKey', 'orderShippingAddressDetails',
    'databaseConnectionPoolMaxSize', 'vehicleEngineTemperatureReading', 'sensorDataProcessingRateLimit',
    'employeePayrollSystemData', 'customerOrderShippingAddress', 'productCatalogItemNumber',
    'transactionProcessingSuccessFlag', 'userAuthenticationAccessToken', 'databaseConnectionPoolConfig',
    'vehicleEngineTemperatureSensor', 'sensorDataProcessingRateLimitation',
    'employeePayrollSystemConfiguration', 'customerAccountBalanceHistoryData',
    'transactionProcessingStatusTracking'
]
# Bucket the candidates by sub-token count: var_list[i] holds names whose
# tokenization is exactly i + 1 sub-tokens long.
var_list = [[] for _ in range(6)]
for var in variable_names:
    try:
        var_list[len(tokenizer.tokenize(var)) - 1].append(var)
    except IndexError:
        # Names longer than 6 sub-tokens fall outside the buckets; skip them.
        continue
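# Illustrative summary (not in the original script): how the mock-name pool
# distributes over sub-token lengths; an empty bucket would make
# random.choice below fail for labels of that length.
for _i, _bucket in enumerate(var_list):
    print(f"{_i + 1} sub-token(s): {len(_bucket)} mock names")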
# Pseudo-log-likelihood (PLL) evaluation: for each test example, score the
# ground-truth identifier and a random same-length mock identifier under both
# the fine-tuned model and the base model.
tot_pll = 0.0
base_tot_pll = 0.0
loop = tqdm(train_loader, leave=True)
cntr = 0
for batch in loop:
    for i in range(len(batch['input_ids'])):
        cntr += 1
        maski = maskis[cntr - 1]
        li = len(maski)  # number of real (non-filler) chunks
        input_ids = batch['input_ids'][i][:li]
        att_mask = batch['attention_mask'][i][:li]
        y = n_y[cntr - 1]
        ty = tokenizer.encode(y, add_special_tokens=False)
        num_sub_tokens_label = len(ty)
        if num_sub_tokens_label > 6:
            continue
        print("Ground truth:", y)
        # Draw a mock identifier with the same sub-token length as the label.
        m_y = random.choice(var_list[num_sub_tokens_label - 1])
        m_ty = tokenizer.encode(m_y, add_special_tokens=False)
        print("Mock truth:", m_y)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=att_mask)
            base_outputs = base_model(input_ids, attention_mask=att_mask)
        # Despite the name, these are the MLM logits, not hidden states.
        last_hidden_state = outputs[0].squeeze()
        base_last_hidden_state = base_outputs[0].squeeze()
        # Collect, for each sub-token position t of the label, the logits at
        # offset t of every masked span.
        l_o_l_sa = [[] for _ in range(num_sub_tokens_label)]
        base_l_o_l_sa = [[] for _ in range(num_sub_tokens_label)]
        if len(maski) == 1:
            # Single chunk: squeeze() dropped the chunk dimension.
            for k in maski[0]:
                for t in range(num_sub_tokens_label):
                    l_o_l_sa[t].append(last_hidden_state[k + t])
                    base_l_o_l_sa[t].append(base_last_hidden_state[k + t])
        else:
            for p in range(len(maski)):
                for k in maski[p]:
                    for t in range(num_sub_tokens_label):
                        if (k + t) >= len(last_hidden_state[p]):
                            # The span spills over the chunk boundary into chunk p+1.
                            l_o_l_sa[t].append(last_hidden_state[p + 1][k + t - len(last_hidden_state[p])])
                            base_l_o_l_sa[t].append(base_last_hidden_state[p + 1][k + t - len(base_last_hidden_state[p])])
                            continue
                        l_o_l_sa[t].append(last_hidden_state[p][k + t])
                        base_l_o_l_sa[t].append(base_last_hidden_state[p][k + t])
        # Average the logits over all occurrences of the masked identifier.
        sum_state = [l_o_l_sa[t][0] for t in range(num_sub_tokens_label)]
        base_sum_state = [base_l_o_l_sa[t][0] for t in range(num_sub_tokens_label)]
        for j in range(1, len(l_o_l_sa[0])):
            for t in range(num_sub_tokens_label):
                sum_state[t] = sum_state[t] + l_o_l_sa[t][j]
                base_sum_state[t] = base_sum_state[t] + base_l_o_l_sa[t][j]
        yip = len(l_o_l_sa[0])
        val = 0.0
        m_val = 0.0
        m_base_val = 0.0
        base_val = 0.0
        for t in range(num_sub_tokens_label):
            sum_state[t] /= yip
            base_sum_state[t] /= yip
            probs = F.softmax(sum_state[t], dim=0)
            base_probs = F.softmax(base_sum_state[t], dim=0)
            # Accumulate the negative log-likelihood of each sub-token of the
            # true and mock names under each model.
            val = val - torch.log(probs[ty[t]])
            m_val = m_val - torch.log(probs[m_ty[t]])
            base_val = base_val - torch.log(base_probs[ty[t]])
            m_base_val = m_base_val - torch.log(base_probs[m_ty[t]])
        # Normalize by length so short and long identifiers are comparable.
        val = val / num_sub_tokens_label
        base_val = base_val / num_sub_tokens_label
        m_val = m_val / num_sub_tokens_label
        m_base_val = m_base_val / num_sub_tokens_label
        print("Sent PLL:")
        print(val)
        print("Base Sent PLL:")
        print(base_val)
        print("Net % difference:")
        diff = (val - base_val) * 100 / base_val
        print(diff)
        tot_pll += val.item()
        base_tot_pll += base_val.item()
        print()
        print()
        print("Mock Sent PLL:")
        print(m_val)
        print("Mock Base Sent PLL:")
        print(m_base_val)
        print("Mock Net % difference:")
        m_diff = (m_val - m_base_val) * 100 / m_base_val
        print(m_diff)
print("Tot PLL: ", tot_pll)
print("Base Tot PLL: ", base_tot_pll)