Instructions to use approach0/dpr-cotbert-320 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use approach0/dpr-cotbert-320 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="approach0/dpr-cotbert-320")

# Load model directly
from transformers import AutoTokenizer, DprEncoder

tokenizer = AutoTokenizer.from_pretrained("approach0/dpr-cotbert-320")
model = DprEncoder.from_pretrained("approach0/dpr-cotbert-320")
```
- Notebooks
- Google Colab
- Kaggle
| import re | |
| import os | |
| import fire | |
| import torch | |
| from functools import partial | |
| from transformers import AutoTokenizer | |
| from transformers import AutoModelForPreTraining | |
| from pya0.preprocess import preprocess_for_transformer | |
def highlight_masked(txt):
    """Return ``txt`` with every literal ``[MASK]`` wrapped in ANSI color codes.

    Each occurrence is prefixed with the SGR 92 escape sequence and followed
    by the SGR 0 (reset) sequence; all other text is returned unchanged.
    """
    color_on = '\033[92m'
    color_off = '\033[0m'

    def _colorize(match):
        # Re-emit the captured token between the two escape sequences.
        return color_on + match.group(1) + color_off

    return re.sub(r"(\[MASK\])", _colorize, txt)
def classifier_hook(tokenizer, tokens, topk, module, inputs, outputs):
    """Forward hook that prints the top candidates for each masked position.

    Intended to be bound with ``functools.partial`` over the first three
    arguments and registered through ``register_forward_hook``; the last
    three arguments are then supplied by PyTorch when the hooked module runs.
    Nothing is returned, so the module's output is left untouched.

    Args:
        tokenizer: Object providing ``convert_ids_to_tokens``. Its
            ``mask_token_id`` attribute is used to locate masked positions
            when it is available.
        tokens: Mapping whose ``'input_ids'`` entry is a tensor of token ids;
            only its first row is inspected.
        topk: Number of candidates printed per masked position.
        module: The hooked module (unused).
        inputs: The hooked module's forward inputs (unused).
        outputs: Pair ``(unmask_scores, seq_rel_scores)``. Only the first
            element is used; it is indexed as ``[0][masked positions]`` and
            its last dimension is ranked.
            NOTE(review): presumably the last dimension is the vocabulary.
    """
    unmask_scores, _seq_rel_scores = outputs

    # Ask the tokenizer for the [MASK] id instead of assuming a vocabulary;
    # 103 (the previously hard-coded value) is kept as a fallback only.
    mask_id = getattr(tokenizer, 'mask_token_id', None)
    if mask_id is None:
        mask_id = 103

    token_ids = tokens['input_ids'][0]
    # Comparing against a plain int avoids creating a CPU tensor that could
    # mismatch the device of ``token_ids``.
    masked_idx = (token_ids == mask_id)
    if not masked_idx.any():
        return

    scores = unmask_scores[0][masked_idx]
    # Only the best ``topk`` entries are needed, so avoid sorting everything.
    k = min(topk, scores.shape[-1])
    top_ids = torch.topk(scores, k, dim=1).indices.detach().cpu()
    for i, mask_cands in enumerate(top_ids):
        print(f'MASK[{i}] top candidates: ' +
              str(tokenizer.convert_ids_to_tokens(mask_cands.tolist())))
def test(tokenizer_name_or_path, model_name_or_path, test_file='test.txt'):
    """Print the model's top candidates for masked words in a test file.

    Each non-blank line of ``test_file`` must have two tab-separated fields:
    a comma-separated list of 1-based word positions to mask (``0`` means
    "mask nothing"), and the sentence itself. The sentence is preprocessed,
    the requested words are replaced with ``[MASK]``, and the model is run
    once with a forward hook on ``model.cls`` that prints the top 3
    candidates for every masked position.

    Args:
        tokenizer_name_or_path: Passed to ``AutoTokenizer.from_pretrained``.
        model_name_or_path: Passed to
            ``AutoModelForPreTraining.from_pretrained``.
        test_file: Path of the tab-separated test file.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    model = AutoModelForPreTraining.from_pretrained(
        model_name_or_path,
        tie_word_embeddings=True
    )
    classifier = model.cls

    with open(test_file, 'r') as fh:
        for line in fh:
            # parse test file line
            line = line.rstrip()
            if not line:
                # Blank lines carry no test case; int('') would raise.
                continue
            fields = line.split('\t')
            maskpos = list(map(int, fields[0].split(',')))

            # preprocess and mask words
            sentence = preprocess_for_transformer(fields[1])
            words = sentence.split()
            for pos in maskpos:
                if pos != 0:
                    words[pos - 1] = '[MASK]'
            sentence = ' '.join(words)
            # Restore masks already written in the input sentence.
            # NOTE(review): presumably preprocessing lowercases them.
            sentence = sentence.replace('[mask]', '[MASK]')

            tokens = tokenizer(sentence,
                padding=True, truncation=True, return_tensors="pt")
            print('*', highlight_masked(sentence))

            # print unmasked
            partial_hook = partial(classifier_hook, tokenizer, tokens, 3)
            hook = classifier.register_forward_hook(partial_hook)
            try:
                with torch.no_grad():
                    model(**tokens)
            finally:
                # Always detach the hook so a failing forward pass does not
                # leave it registered on the model.
                hook.remove()
if __name__ == '__main__':
    # Set the PAGER environment variable to `cat` before handing control to
    # fire. NOTE(review): presumably this makes fire's help output print
    # directly instead of opening an interactive pager — confirm.
    os.environ.update(PAGER='cat')
    fire.Fire(test)