import re

from datasets import load_dataset
from deepmultilingualpunctuation import PunctuationModel
from multiprocess import set_start_method

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

import nltk
import spacy

# The NLTK tokenizers and tagger require the "punkt" and
# "averaged_perceptron_tagger" resources (nltk.download(...)) to be available.

# Punctuation restoration model.
model = PunctuationModel()

# Source dataset; the "text" column is repunctuated and true-cased below.
ds = load_dataset("ylacombe/mls-eng-tags", split="train", num_proc=16)

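# Optional smoke test of the punctuation model on a hypothetical sentence (not a
# dataset row); uncomment locally to check that punctuation gets restored:
# print(model.restore_punctuation("my name is clara and i live in berkeley california"))
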
# NLTK-based true-casing (not used by the mapping below, which relies on the
# spaCy variant): capitalize proper nouns and sentence-initial words.
def truecasing_by_pos(input_text):
    # Split the input into sentences.
    sent_texts = sent_tokenize(input_text)

    full_text = ""

    for sent_text in sent_texts:
        # Tokenize, then POS-tag the lowercased tokens.
        words = word_tokenize(sent_text)
        tagged_words = pos_tag([word.lower() for word in words])

        # Capitalize proper nouns (NNP/NNPS) and the first word of the sentence.
        capitalized_words = [w.capitalize() if t in ["NNP", "NNPS"] else w for (w, t) in tagged_words]
        capitalized_words[0] = capitalized_words[0].capitalize()

        text_truecase = " ".join(capitalized_words)

        # Keep a space between sentences when rebuilding the text.
        full_text += text_truecase.strip() + " "

    return full_text.strip()

# Second NLTK-based variant, which preserves the original token casing; also
# unused by the mapping below.
def true_case(text):
    sentences = nltk.sent_tokenize(text)

    true_cased_sentences = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)

        # Capitalize the first token and any proper nouns (NNP/NNPS).
        for i, (word, tag) in enumerate(tagged):
            if i == 0 or tag in ('NNP', 'NNPS'):
                tagged[i] = (word.capitalize(), tag)

        true_cased_sentence = ' '.join(word for word, tag in tagged)

        # Re-attach punctuation that word_tokenize split off ("word ." -> "word.").
        true_cased_sentence = re.sub(r'(\w) (\W)', r'\1\2', true_cased_sentence)

        true_cased_sentences.append(true_cased_sentence)

    true_cased_text = ' '.join(true_cased_sentences)

    return true_cased_text

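# Quick local check of the NLTK-based variants on a hypothetical string (outputs
# depend on the NLTK tagger, so none are asserted here); uncomment to try:
# sample = "the letters were sent from paris and read aloud by john."
# print(truecasing_by_pos(sample))
# print(true_case(sample))
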
# Run the spaCy pipeline on GPU 2.
spacy.require_gpu(gpu_id=2)

# Transformer-based English pipeline.
nlp = spacy.load('en_core_web_trf')

from spacy.util import compile_infix_regex

def custom_tokenizer(nlp):
    # Add an infix pattern so hyphenated words (e.g. "twenty-one") are matched
    # as a single unit instead of being split at the hyphen.
    infixes = nlp.Defaults.infixes + [r'\w+(?:-\w+)+']
    infix_regex = compile_infix_regex(infixes)
    return spacy.tokenizer.Tokenizer(nlp.vocab, infix_finditer=infix_regex.finditer)

# Replace the default tokenizer with the hyphen-aware one.
nlp.tokenizer = custom_tokenizer(nlp)

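# Rough, illustrative check of the hyphen handling (not part of the processing
# path); uncomment to inspect how the custom tokenizer segments a sample phrase:
# print([t.text for t in nlp("a well-known twenty-one year old")])
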
# spaCy-based true-casing: capitalize sentence-initial tokens and proper nouns
# (POS tag PROPN). This is the variant applied to the dataset.
def true_case_spacy(text):
    doc = nlp(text)

    true_cased_sentences = []

    for sent in doc.sents:
        processed_tokens = []

        for i, token in enumerate(sent):
            # Capitalize the first token of each sentence and every proper noun.
            if i == 0 or token.pos_ == 'PROPN':
                processed_tokens.append(token.text.capitalize())
            else:
                processed_tokens.append(token.text)

        processed_sentence = ' '.join(processed_tokens)

        # Re-attach punctuation that was split off during tokenization.
        processed_sentence = re.sub(r'(\w) (\W)', r'\1\2', processed_sentence)

        true_cased_sentences.append(processed_sentence)

    true_cased_text = ' '.join(true_cased_sentences)

    return true_cased_text

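# End-to-end sketch for a single transcript (hypothetical text, not a dataset
# row); uncomment to trace what the map function below does to one string:
# repunct = model.restore_punctuation("on monday john flew to new york with anna")
# print(true_case_spacy(repunct))
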
def repunctuation_apply_simple(batch):
    # Called once per example (batched mapping is not enabled), so batch["text"]
    # is a single string: restore punctuation, then true-case with spaCy.
    repunct_sample = model.restore_punctuation(batch["text"])
    batch["repunct_text"] = true_case_spacy(repunct_sample)

    return batch


if __name__ == "__main__":
    # "spawn" avoids forking a process that has already initialised CUDA.
    set_start_method("spawn")
    repunct_ds = ds.map(repunctuation_apply_simple, batch_size=1, num_proc=14)
    repunct_ds.push_to_hub("reach-vb/mls-eng-tags-spacy-v2", split="train")