| """ |
| Dataset creation tools. |
| |
| Keep top-level imports clean of non-trivial imports for specific tools,
| because this file is imported for various purposes.
| """ |
|
|
| import ast |
| import concurrent.futures |
| import contextlib |
| import hashlib |
| import json |
| import os |
| import shutil |
| import signal |
| import sys |
| import traceback |
| from concurrent.futures import ProcessPoolExecutor |
|
|
| import psutil |
| import pytest |
| import pandas as pd |
| import numpy as np |
| from tqdm import tqdm |
|
|
| from utils import flatten_list, remove |
|
|
|
|
| def parse_rst_file(filepath): |
| with open(filepath, 'r') as f: |
| input_data = f.read() |
| settings_overrides = {'initial_header_level': 2} |
| from docutils import core |
| document = core.publish_doctree( |
| source=input_data, |
| source_path=filepath, |
| settings_overrides=settings_overrides, |
| ) |
| qa_pairs = [] |
| current_section = None |
| current_question = "" |
| current_answer = "" |
| for node in document.traverse(): |
| if node.__class__.__name__ == 'section': |
| current_section = "" |
| elif current_section is not None: |
| if node.__class__.__name__ == 'Text': |
| if node.astext().endswith("?"):  # endswith is safe even for empty Text nodes
| if current_question: |
| qa_pairs.append((current_question, current_answer)) |
| current_question = node.astext() |
| current_answer = "" |
| else: |
| current_answer += node.astext() |
| if current_answer: |
| qa_pairs.append((current_question, current_answer)) |
| return {k: v for k, v in qa_pairs} |
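| # Hedged usage sketch (path and question text are illustrative only):
| #   qa = parse_rst_file(os.path.join(os.path.expanduser('~'), 'h2oai/docs/faq.rst'))
| #   # -> dict mapping each question Text node ending in "?" to the prose after it,
| #   #    e.g. {"What is Driverless AI?": "Driverless AI is ..."}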
|
|
|
|
| def test_scrape_dai_docs(): |
| home = os.path.expanduser('~') |
| file = os.path.join(home, 'h2oai/docs/faq.rst') |
| qa_pairs = parse_rst_file(file) |
| prompt_type = 'human_bot' |
| from prompter import prompt_types |
| assert prompt_type in prompt_types |
| save_thing = [{"instruction": k, "output": v, 'prompt_type': prompt_type} for k, v in qa_pairs.items()] |
| output_file = "dai_faq.json" |
| with open(output_file, "wt") as f: |
| f.write(json.dumps(save_thing, indent=2)) |
|
|
|
|
| def test_scrape_dai_docs_all(): |
| """ |
| pytest create_data.py::test_scrape_dai_docs_all |
| """ |
| import glob |
| import nltk |
| nltk.download('punkt') |
| dd = {} |
| np.random.seed(1234) |
| home = os.path.expanduser('~') |
| files = list(glob.glob(os.path.join(home, "h2oai/docs/**/*rst"))) |
| np.random.shuffle(files) |
| val_count = int(0.05 * len(files)) |
| train_files = files[val_count:] |
| valid_files = files[:val_count] |
| things = [ |
| ("dai_docs.train.json", train_files), |
| ("dai_docs.valid.json", valid_files) |
| ] |
| for LEN in [100, 200, 500]: |
| for output_file, ff in things: |
| if output_file not in dd: |
| dd[output_file] = [] |
| for f in ff: |
| with open(f) as input_file:  # avoid shadowing the builtin input()
| blob = input_file.read()
| blob = blob.replace("~~", "") |
| blob = blob.replace("==", "") |
| blob = blob.replace("''", "") |
| blob = blob.replace("--", "") |
| blob = blob.replace("**", "") |
| dd[output_file].extend(get_sentences(blob, length=LEN)) |
| for output_file, _ in things: |
| save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in dd[output_file]] |
| with open(output_file, "wt") as f: |
| f.write(json.dumps(save_thing, indent=2)) |
|
|
|
|
| def get_sentences(blob, length):
| """
| Break input text into sentences, then group them into chunks of roughly `length` characters.
| :param blob: input text
| :param length: approximate maximum chunk size in characters
| :return: list of text chunks
| """
| import nltk
| nltk.download('punkt')
| from nltk.tokenize import sent_tokenize
| sentences = sent_tokenize(blob)
| my_sentences = []
| my_string = ""
| for sentence in sentences:
| if len(my_string) + len(sentence) <= length:
| if my_string:
| my_string += " " + sentence
| else:
| my_string = sentence
| else:
| # flush the full chunk, then start the next chunk with the sentence that did not fit
| if my_string:
| my_sentences.append(my_string)
| my_string = sentence
| # flush any remaining partial chunk so the tail of the text is not lost
| if my_string:
| my_sentences.append(my_string)
| return my_sentences or [""]
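| # Hedged usage sketch (requires nltk punkt; exact splits depend on the tokenizer):
| #   get_sentences("First sentence. Second sentence. Third.", length=30)
| #   # -> e.g. ["First sentence.", "Second sentence. Third."]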
|
|
|
|
| def setup_dai_docs(path=None, dst="working_dir_docs", from_hf=False): |
| """ |
| Only supported if one has access to the source code, or to an HF token for HF spaces with from_hf=True.
| :param path: |
| :param dst: |
| :param from_hf: |
| :return: |
| """ |
|
|
| home = os.path.expanduser('~') |
|
|
| if from_hf: |
| |
| from huggingface_hub import hf_hub_download |
| |
| # prefer an explicit env token; True falls back to the locally cached HF token
| token = os.getenv('HUGGING_FACE_HUB_TOKEN', True)
| path_to_zip_file = hf_hub_download('h2oai/dai_docs', 'dai_docs.zip', token=token, repo_type='dataset') |
| path = 'h2oai' |
| import zipfile |
| with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref: |
| zip_ref.extractall(path) |
| path = os.path.join(path, 'docs/**/*') |
|
|
| if path is None: |
| if os.path.isdir(os.path.join(home, 'h2oai')): |
| path = os.path.join(home, "h2oai/docs/**/*") |
| else: |
| assert os.path.isdir(os.path.join(home, 'h2oai.superclean')), '%s does not exist' % os.path.join(home, 'h2oai.superclean')
| path = os.path.join(home, "h2oai.superclean/docs/**/*") |
| import glob |
| files = list(glob.glob(path, recursive=True)) |
|
|
| |
|
|
| remove(dst) |
| os.makedirs(dst) |
|
|
| # copy all doc files flat into the working directory
| for fil in files: |
| if os.path.isfile(fil): |
| shutil.copy(fil, dst) |
|
|
| # also copy *.frag files into a scorers/ subdirectory
| scorers_dir = os.path.join(dst, 'scorers') |
| makedirs(scorers_dir) |
| for fil in glob.glob(os.path.join(dst, '*.frag')): |
| shutil.copy(fil, scorers_dir) |
|
|
| return dst |
|
|
|
|
| def rst_to_outputs(files, min_len=30, max_len=2048 // 2 - 30): |
| |
|
|
| |
| import pypandoc |
| basedir = os.path.abspath(os.getcwd()) |
|
|
| outputs = [] |
| for fil in files: |
| os.chdir(basedir) |
| os.chdir(os.path.dirname(fil)) |
| fil = os.path.basename(fil) |
| print("Processing %s" % fil, flush=True) |
| |
| |
| |
| |
| |
| |
| |
| |
| out_format = 'plain' |
| |
| # we already chdir'd into the file's directory, so '.' resolves relative resources
| extra_args = ['--wrap=preserve', '--resource-path=.']
|
|
| plain_list = [] |
| try: |
| # first try splitting on literal blocks (``) and converting each piece separately
| input_rst = pypandoc.convert_file(fil, 'rst') |
| input_list = input_rst.split('\n``') |
| for input_subrst in input_list: |
| input_plain = pypandoc.convert_text(input_subrst, format='rst', to='plain') |
| plain_list.append([input_plain, fil]) |
| except Exception as e: |
| print("file exception: %s %s" % (fil, str(e)), flush=True) |
|
|
| if not plain_list: |
| # fall back to converting the whole file at once
| output = pypandoc.convert_file(fil, out_format, extra_args=extra_args, format='rst') |
| outputs1 = get_sentences(output, length=max_len) |
| for oi, output in enumerate(outputs1): |
| output = output.replace('\n\n', '\n') |
| plain_list.append([output, fil]) |
| outputs.extend(plain_list) |
|
|
| |
| |
|
|
| |
| new_outputs = [] |
| num_truncated = 0 |
| num_orig = len(outputs) |
| for output, fil in outputs: |
| if len(output) < max_len: |
| new_outputs.append([output, fil]) |
| continue |
| outputs1 = get_sentences(output, length=max_len) |
| for oi, output1 in enumerate(outputs1): |
| output1 = output1.replace('\n\n', '\n') |
| new_outputs.append([output1, fil]) |
| num_truncated += 1 |
| print('num_orig: %s num_truncated: %s' % (num_orig, num_truncated), flush=True) |
|
|
| new_outputs = [[k.strip(), fil] for k, fil in new_outputs if len(k.strip()) > min_len] |
|
|
| return new_outputs |
|
|
|
|
| def test_scrape_dai_docs_all_pandoc(): |
| """ |
| pytest -s -v create_data.py::test_scrape_dai_docs_all_pandoc |
| :return: |
| """ |
|
|
| dst = setup_dai_docs() |
|
|
| import glob |
| files = list(glob.glob(os.path.join(dst, '*rst'), recursive=True)) |
|
|
| basedir = os.path.abspath(os.getcwd()) |
| new_outputs = rst_to_outputs(files) |
| os.chdir(basedir) |
|
|
| remove(dst) |
| save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k, _ in new_outputs]
| output_file = "dai_docs.train_cleaned.json" |
| with open(output_file, "wt") as f: |
| f.write(json.dumps(save_thing, indent=2)) |
|
|
|
|
| def test_config_to_json(): |
| """ |
| Needs to run from Driverless AI source directory. |
| E.g. (base) jon@gpu:~/h2oai$ pytest -s -v /data/jon/h2ogpt/create_data.py::test_config_to_json ; cp config.json /data/jon/h2ogpt/ |
| :return: |
| """ |
| try: |
| |
| from h2oaicore.systemutils import config |
| toml_list = [] |
| for k, v in config.get_meta_dict().items(): |
| title = (v.title + ": ") if v.title else '' |
| comment = v.comment or '' |
| if not (title or comment): |
| continue |
| toml_list.extend( |
| [ |
| { |
| 'prompt_type': 'plain', |
| 'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace( |
| "\n", ""), |
| }, |
| { |
| 'prompt_type': 'plain', |
| 'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace( |
| "\n", ""), |
| }, |
| { |
| 'prompt_type': 'plain', |
| 'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace( |
| "\n", ""), |
| } if title and comment else None, |
| { |
| 'prompt_type': 'human_bot', |
| 'instruction': f'Explain the following expert setting for Driverless AI', |
| 'input': f"{k}", |
| 'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""), |
| }, |
| { |
| 'prompt_type': 'human_bot', |
| 'instruction': f'Explain the following expert setting for Driverless AI', |
| 'input': f"{k}", |
| 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), |
| }, |
| { |
| 'prompt_type': 'human_bot', |
| 'instruction': f'Explain the following expert setting for Driverless AI', |
| 'input': f"{k.replace('_', ' ')}", |
| 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), |
| }, |
| { |
| 'prompt_type': 'human_bot', |
| 'instruction': f'Explain the following expert setting for Driverless AI', |
| 'input': f"{title}", |
| 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), |
| }, |
| { |
| 'prompt_type': 'human_bot', |
| 'instruction': f'Provide a short explanation of the expert setting {k}', |
| 'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""), |
| }, |
| { |
| 'prompt_type': 'human_bot', |
| 'instruction': f'Provide a detailed explanation of the expert setting {k}', |
| 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), |
| }, |
| ] |
| ) |
| toml_list = [x for x in toml_list if x] |
| with open("config.json", "wt") as f: |
| f.write(json.dumps(toml_list, indent=2)) |
| except Exception as e: |
| print("Exception: %s" % str(e), flush=True) |
|
|
|
|
| def copy_tree(src, dst, follow_symlink=False): |
| makedirs(dst, exist_ok=True) |
| for (path, dirs, files) in os.walk(src, followlinks=follow_symlink): |
| new_path = path.replace(src, dst) |
| makedirs(new_path, exist_ok=True) |
| for file in files: |
| filename = os.path.join(path, file) |
| new_filename = os.path.join(new_path, file) |
| |
| try: |
| atomic_copy(filename, new_filename) |
| except FileNotFoundError: |
| pass |
|
|
|
|
| def atomic_move(src, dst): |
| try: |
| shutil.move(src, dst) |
| except (shutil.Error, FileExistsError): |
| pass |
| remove(src) |
|
|
|
|
| def atomic_copy(src=None, dst=None, with_permissions=True): |
| if os.path.isfile(dst): |
| return |
| import uuid |
| my_uuid = uuid.uuid4() |
| dst_tmp = dst + str(my_uuid) |
| makedirs(os.path.dirname(dst), exist_ok=True) |
| if with_permissions: |
| shutil.copy(src, dst_tmp) |
| else: |
| shutil.copyfile(src, dst_tmp) |
| atomic_move(dst_tmp, dst) |
| remove(dst_tmp) |
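| # Design note: copying to a uuid-suffixed temp file and renaming it into place means
| # readers never observe a partially written dst (rename is atomic on one filesystem);
| # the trailing remove(dst_tmp) only matters if the move lost a race to another writer.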
|
|
|
|
| def makedirs(path, exist_ok=True): |
| """ |
| Avoid some inefficiency in os.makedirs() |
| :param path: |
| :param exist_ok: |
| :return: |
| """ |
| if os.path.isdir(path):  # isdir() already implies existence
| assert exist_ok, "Path already exists" |
| return path |
| os.makedirs(path, exist_ok=exist_ok) |
|
|
|
|
| |
| |
| def test_prep_instruct_vicuna(): |
| from datasets import load_dataset |
| filename = 'ShareGPT_unfiltered_cleaned_split.json' |
| if not os.path.exists(filename): |
| os.system( |
| 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename) |
| data = load_dataset("json", data_files={"train": filename})["train"] |
| training_rows = [] |
| for i in range(data.num_rows): |
| conversations = data[i]['conversations'] |
| assert isinstance(conversations, list), conversations |
| convo = "" |
| for j, conv in enumerate(conversations): |
| |
| |
| if conv['from'] == 'human':
| FROM = '<human>: '
| elif conv['from'] == 'gpt':
| FROM = '<bot>: '
| else:
| continue  # skip other roles (e.g. system) so FROM is never undefined
| convo += FROM + conv['value'] + "\n"
| if convo: |
| training_rows.append(dict(input=convo)) |
| with open(filename + ".generate_human_bot.train_plain.json", "wt") as f: |
| f.write(json.dumps(training_rows, indent=2)) |
|
|
|
|
| POSTFIX = ".generate_human_bot.train_plain.json" |
|
|
| |
| OIG_DATASETS = [ |
| "unified_chip2.jsonl", |
| "unified_grade_school_math_instructions.jsonl", |
| "unified_poetry_2_song.jsonl", |
| "unified_plot_screenplay_books_dialog.jsonl", |
| ] |
|
|
| |
| ALL_OIG_DATASETS = ['unified_abstract_infill.jsonl', |
| 'unified_basic.jsonl', |
| 'unified_canadian_parliament.jsonl', |
| 'unified_chip2.jsonl', |
| 'unified_conv_finqa.jsonl', |
| 'unified_cuad.jsonl', |
| 'unified_essays.jsonl', |
| 'unified_flan.jsonl.gz', |
| 'unified_grade_school_math_instructions.jsonl', |
| 'unified_hc3_human.jsonl', |
| 'unified_image_prompts_instructions.jsonl', |
| 'unified_joke_explanations.jsonl', |
| 'unified_mathqa_flanv2_kojma_cot.jsonl', |
| 'unified_merged_code_xp3.jsonl', |
| 'unified_multi_news.jsonl', |
| 'unified_multi_sum.jsonl', |
| 'unified_ni.jsonl.gz', |
| 'unified_nq.jsonl', |
| 'unified_openai_summarize_tldr.jsonl', |
| 'unified_oscar_en_sample_dialog.jsonl', |
| 'unified_p3.jsonl.gz', |
| 'unified_plot_screenplay_books_dialog.jsonl', |
| 'unified_poetry_2_song.jsonl', |
| 'unified_poetry_instructions.jsonl', |
| 'unified_rallio_safety_and_prosocial.jsonl', |
| 'unified_rallio_soda_upgraded_2048.jsonl', |
| 'unified_soda_dialog.jsonl', |
| 'unified_sqlv1.jsonl', |
| 'unified_sqlv2.jsonl', |
| 'unified_squad_v2.jsonl', |
| 'unified_squad_v2_more_neg.jsonl', |
| 'unified_ul2_plus_oscar_en_sample_dialog.jsonl', |
| 'unified_unifiedskg_instructions.jsonl', |
| 'unified_unnatural_instructions.jsonl', |
| 'unified_xp3_sample.jsonl'] |
|
|
| useful_oig_files = ['unified_rallio_safety_and_prosocial.jsonl.parquet', |
| 'unified_chip2.jsonl.parquet', |
| 'unified_cuad.jsonl.parquet', |
| 'unified_essays.jsonl.parquet', |
| 'unified_flan.jsonl.gz.parquet', |
| 'unified_grade_school_math_instructions.jsonl.parquet', |
| 'unified_hc3_human.jsonl.parquet', |
| 'unified_mathqa_flanv2_kojma_cot.jsonl.parquet', |
| 'unified_merged_code_xp3.jsonl.parquet', |
| 'unified_multi_news.jsonl.parquet', |
| |
| 'unified_ni.jsonl.gz.parquet', |
| 'unified_openai_summarize_tldr.jsonl.parquet', |
| |
| 'unified_plot_screenplay_books_dialog.jsonl.parquet', |
| 'unified_soda_dialog.jsonl.parquet', |
| 'unified_unnatural_instructions.jsonl.parquet', |
| ] |
|
|
|
|
| @pytest.mark.parametrize("filename", OIG_DATASETS) |
| def test_get_small_sample_oig_data(filename): |
| if not os.path.exists(filename): |
| os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename) |
| import json |
| rows = [] |
| with open(filename, "r") as f: |
| for line in f.readlines(): |
| row = json.loads(line) |
| rows.append(dict(input=row["text"])) |
| with open(filename + POSTFIX, "w") as f: |
| f.write(json.dumps(rows, indent=2)) |
|
|
|
|
| @pytest.mark.parametrize("filename", ALL_OIG_DATASETS) |
| def test_download_useful_data_as_parquet(filename): |
| dest_file = filename + '.parquet' |
| if dest_file not in useful_oig_files: |
| pytest.skip('file declared not useful') |
| if not os.path.exists(filename): |
| os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename) |
| if not os.path.exists(dest_file): |
| df = pd.read_json(path_or_buf=filename, lines=True) |
| df.to_parquet(dest_file, index=False) |
|
|
|
|
| def test_merge_shuffle_small_sample_oig_data(): |
| np.random.seed(1234) |
| rows = [] |
| for filename in OIG_DATASETS: |
| with open(filename + POSTFIX, "r") as f: |
| rows.extend(json.loads(f.read())) |
| np.random.shuffle(rows) |
| with open("merged_shuffled_OIG_%s.json" % hashlib.sha256(str(OIG_DATASETS).encode()).hexdigest()[:10], "w") as f: |
| f.write(json.dumps(rows, indent=2)) |
|
|
|
|
| def test_join_jsons(): |
| files = ['config.json'] * 1 + \ |
| ['dai_docs.train_cleaned.json'] * 2 + \ |
| ['dai_faq.json'] * 3 |
| print(files) |
| lst = []
| for fil in files:
| lst.extend(json.load(open(fil, 'rt')))
| print(len(lst)) |
| json.dump(lst, open("merged.json", "wt"), indent=2) |
|
|
|
|
| @pytest.mark.parametrize("filename", ['Anthropic/hh-rlhf']) |
| def test_make_rlhf_good_data(filename): |
| from datasets import load_dataset |
| rows = load_dataset(filename)["train"]["chosen"] |
| new_rows = [] |
| for row in rows: |
| if row[:2] == "\n\n": |
| row = row[2:] |
| row = row.replace("Human: ", "<human>: ") |
| row = row.replace("Assistant: ", "<bot>: ") |
| new_rows.append(dict(input=row)) |
| with open(filename.replace("/", "_") + POSTFIX, "w") as f: |
| f.write(json.dumps(new_rows, indent=2)) |
|
|
|
|
| def test_show_prompts(): |
| files = ['config.json'] * 1 + \ |
| ['dai_docs.train_cleaned.json'] * 1 + \ |
| ['dai_faq.json'] * 1 |
| file_points = [json.load(open(fil, 'rt')) for fil in files] |
| from prompter import generate_prompt |
| for data_points in file_points: |
| for data_point in data_points: |
| print(generate_prompt(data_point, 'plain', '', False, False)[0]) |
|
|
|
|
| def test_get_open_datasets(): |
| |
| open_tags = ['license:Apache License 2.0', |
| 'license:mit', |
| 'license:apache', |
| 'license:apache2', |
| 'license:apache-2.0', |
| 'license:bsd', |
| 'license:bsd-2-clause', |
| 'license:bsd-3-clause', |
| 'license:bsd-3-clause-clear', |
| 'license:lgpl-2.1', |
| 'license:lgpl-3.0', |
| 'license:lgpl-lr', |
| 'license:lgpl', |
| 'license:openrail++', |
| 'license:openrail', |
| 'license:bigscience-bloom-rail-1.0', |
| |
| 'license:other', |
| 'license:unknown', |
| |
| |
| 'license:odc-by', |
| 'license:cc-by-4.0', |
| 'license:cc-by-3.0', |
| 'license:cc-by-2.0', |
| 'license:cc-by-2.5', |
| |
| 'license:odbl', |
| 'license:pddl', |
| 'license:ms-pl', |
| 'license:zlib', |
| ] |
| |
|
|
| from huggingface_hub import list_datasets |
| datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags]) |
| datasets += [x for x in list_datasets(author='openai')] |
| |
| all_license_tags = set(flatten_list([[y for y in x.tags if 'license' in y] for x in datasets])) |
| print(len(all_license_tags)) |
| open_datasets = [x for x in datasets if any([y in x.tags for y in open_tags]) or 'license:' not in str(x.tags)] |
| print('open_datasets', len(open_datasets)) |
| all_task_tags = set(flatten_list([[y for y in x.tags if 'task' in y] for x in open_datasets])) |
| print('all_task_tags', len(all_task_tags)) |
| excluded_tags = ['image', 'hate', 'tabular', 'table-', 'classification', 'retrieval', |
| 'translation', 'identification', 'object', 'mask', 'to-text', |
| 'face-detection', 'audio', 'voice', 'reinforcement', 'depth-est', |
| 'forecasting', 'parsing', 'visual', 'speech', 'multiple-choice', |
| 'slot-filling', 'irds/argsme', '-scoring', 'other', 'graph-ml', |
| 'feature-extraction', 'keyword-spotting', |
| 'coreference-resolution', 'segmentation', |
| 'word-sense-disambiguation', |
| 'lemmatization'] |
| task_tags = [x.replace('task_categories:', '').replace('task_ids:', '') |
| for x in all_task_tags if not any([y in x for y in |
| excluded_tags])] |
| print('task_tags', len(task_tags)) |
| |
| open_tasked_datasets = [x for x in open_datasets if
| (any([y in str([x for x in x.tags if 'task' in x]) for y in task_tags]) and
| not any([y in str([x for x in x.tags if 'task' in x]) for y in excluded_tags])) or
| ('task_categories' not in str(x.tags) and 'task_ids' not in str(x.tags))]
| open_tasked_datasets = [x for x in open_tasked_datasets if not x.disabled] |
| open_tasked_datasets = [x for x in open_tasked_datasets if not x.gated] |
| open_tasked_datasets = [x for x in open_tasked_datasets if not x.private] |
| print('open_tasked_datasets', len(open_tasked_datasets)) |
| sizes = list(set(flatten_list([[(y, x.id) for y in x.tags if 'size' in y] for x in open_tasked_datasets]))) |
| languages = list(set(flatten_list([[(y, x.id) for y in x.tags if 'language:' in y] for x in open_tasked_datasets]))) |
| open_english_tasked_datasets = [x for x in open_tasked_datasets if |
| 'language:' not in str(x.tags) or |
| 'language:en' in str(x.tags)] |
| small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if |
| 'n<1K' in str(x.tags) or |
| '1K<n<10K' in str(x.tags) or |
| '10K<n<100K' in str(x.tags) or
| '100K<n<1M' in str(x.tags) or |
| 'size_category' not in str(x.tags) |
| ] |
| |
| |
| ids = [x.id for x in small_open_english_tasked_datasets] |
|
|
| |
| # spot-check that known-useful datasets survived the filters
| assert 'alespalla/chatbot_instruction_prompts' in ids |
| assert 'laion/OIG' in ids |
| assert 'openai/webgpt_comparisons' in ids |
| assert 'openai/summarize_from_feedback' in ids |
| assert 'Anthropic/hh-rlhf' in ids |
|
|
| |
| |
|
|
| print('open_english_tasked_datasets: ', ids, flush=True) |
|
|
| exclude_ids = ['allenai/nllb', |
| 'hf-internal-testing/fixtures_image_utils', |
| 'allenai/c4', |
| 'agemagician/uniref50', |
| 'huggingface-course/documentation-images', |
| 'smilegate-ai/kor_unsmile', |
| 'MohamedRashad/ChatGPT-prompts', |
| 'humarin/chatgpt-paraphrases', |
| 'Jeska/vaccinchat', |
| 'alespalla/chatbot_instruction_prompts', |
| 'allenai/prosocial-dialog', |
| |
| 'AlekseyKorshuk/persona-chat', |
| 'bavard/personachat_truecased', |
| 'adamlin/daily_dialog', |
| 'adamlin/FewShotWoz', |
| 'benjaminbeilharz/better_daily_dialog', |
| 'benjaminbeilharz/daily_dialog_w_turn_templates', |
| 'benjaminbeilharz/empathetic_dialogues_for_lm', |
| 'GEM-submissions/GEM__bart_base_schema_guided_dialog__1645547915', |
| 'ia-bentebib/conv_ai_2_fr', |
| 'ia-bentebib/daily_dialog_fr', |
| 'ia-bentebib/dialog_re_fr', |
| 'ia-bentebib/empathetic_dialogues_fr', |
| 'roskoN/dailydialog', |
| 'VadorMazer/skyrimdialogstest', |
| 'bigbio/med_qa', |
| 'biu-nlp/qa_srl2018', |
| 'biu-nlp/qa_discourse', |
| 'iarfmoose/qa_evaluator', |
| 'jeopardy', |
| 'narrativeqa', |
| 'nomic-ai/gpt4all_prompt_generations', |
| 'nomic-ai/gpt4all_prompt_generations_with_p3', |
| 'HuggingFaceH4/alpaca', |
| 'tatsu-lab/alpaca', |
| 'yahma/alpaca-cleaned', |
| 'Hello-SimpleAI/HC3', |
| 'glue', |
| 'sahil2801/CodeAlpaca-20k', |
| 'Short-Answer-Feedback/saf_communication_networks_english', |
| ] |
| small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if x.id not in exclude_ids] |
| # drop obviously out-of-scope datasets by id substring
| small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id] |
| |
| small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if |
| 'hf-internal-testing' not in x.id] |
| small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if |
| 'chinese' not in x.id] |
|
|
| sorted_small_open_english_tasked_datasets = sorted([(x.downloads, x) for x in small_open_english_tasked_datasets], |
| key=lambda x: x[0], reverse=True) |
|
|
| |
| |
| |
| |
| |
| |
|
|
| """ |
| https://huggingface.co/datasets/wikihow/blob/main/wikihow.py |
| https://github.com/mahnazkoupaee/WikiHow-Dataset |
| https://ucsb.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358 |
| https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358 |
| """ |
|
|
| """ |
| # some ambiguous or non-commercial datasets |
| https://github.com/PhoebusSi/alpaca-CoT |
| """ |
|
|
| timeout = 3 * 60 |
| # convert each dataset to parquet in a subprocess, enforcing a hard timeout
| for num_downloads, dataset in sorted_small_open_english_tasked_datasets: |
| data_id = dataset.id |
| func = do_one |
| args = (data_id, num_downloads) |
| kwargs = {} |
| with ProcessPoolExecutor(max_workers=1) as executor: |
| future = executor.submit(func, *args, **kwargs) |
| try: |
| future.result(timeout=timeout) |
| except concurrent.futures.TimeoutError: |
| print("\n\ndata_id %s timeout\n\n" % data_id, flush=True) |
| for child in psutil.Process(os.getpid()).children(recursive=True):
| # escalate signals; suppress races where the child already exited
| with contextlib.suppress(psutil.NoSuchProcess, ProcessLookupError):
| os.kill(child.pid, signal.SIGINT)
| os.kill(child.pid, signal.SIGTERM)
| os.kill(child.pid, signal.SIGKILL)
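| # Design note: each dataset runs in its own single-worker ProcessPoolExecutor so that a
| # hung load_dataset() call can be abandoned after `timeout` seconds, with any child
| # processes signalled explicitly, without taking down the main process.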
|
|
|
|
| def do_one(data_id, num_downloads): |
| from datasets import load_dataset |
| out_file = "data_%s.parquet" % str(data_id.replace('/', '_')) |
| if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024 ** 3: |
| return |
| try: |
| print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True) |
| avail_list = None |
| try: |
| # deliberately request a bogus config: the raised error message lists the available configs
| data = load_dataset(data_id, 'foobar')
| except Exception as e: |
| if 'Available: ' in str(e): |
| avail_list = ast.literal_eval(str(e).split('Available:')[1].strip()) |
| else: |
| avail_list = None |
| if avail_list is None: |
| avail_list = [None] |
| print("%s avail_list: %s" % (data_id, avail_list), flush=True) |
|
|
| for name in avail_list: |
| out_file = "data_%s_%s.parquet" % (str(data_id.replace('/', '_')), str(name)) |
| if os.path.isfile(out_file): |
| continue |
| data = load_dataset(data_id, name) |
| column_names_dict = data.column_names |
| column_names = column_names_dict[list(column_names_dict.keys())[0]] |
| print("Processing data_id %s num_downloads: %s columns: %s" % (data_id, num_downloads, column_names), |
| flush=True) |
| data_dict = data.data |
| col_dict = data.num_columns |
| first_col = list(col_dict.keys())[0] |
| if 'train' in data_dict: |
| df = data['train'].to_pandas() |
| else: |
| df = data[first_col].to_pandas() |
| |
| df.to_parquet(out_file, index=False) |
| except Exception as e: |
| t, v, tb = sys.exc_info() |
| ex = ''.join(traceback.format_exception(t, v, tb)) |
| print("Exception: %s %s" % (data_id, ex), flush=True) |
|
|
|
|
| def test_otherlic(): |
| from huggingface_hub import list_datasets |
| lic = ['license:odc-by', |
| 'license:cc-by-4.0', |
| 'license:cc-by-3.0', |
| 'license:cc-by-2.0', |
| 'license:cc-by-2.5', |
| 'license:cc-by-sa-4.0', |
| 'license:odbl', |
| 'license:pddl', |
| 'license:ms-pl', |
| 'license:zlib', |
| ] |
| datasets = flatten_list([[x for x in list_datasets(filter=y) if 'translation' not in str(x.tags)] for y in lic]) |
| print(len(datasets)) |
|
|
|
|
| |
| # manually curated instruction/QA-style datasets worth downloading
| useful = ['Dahoas/instruct-human-assistant-prompt', |
| 'Dahoas/first-instruct-human-assistant-prompt', |
| 'knkarthick/dialogsum', |
| 'McGill-NLP/FaithDial', |
| 'Zaid/quac_expanded', |
| '0-hero/OIG-small-chip2', |
| 'alistvt/coqa-flat', |
| 'AnonymousSub/MedQuAD_47441_Question_Answer_Pairs', |
| 'Anthropic/hh-rlhf', |
| 'arjunth2001/online_privacy_qna', |
| 'Dahoas/instruct_helpful_preferences', |
| 'Dahoas/rl-prompt-dataset', |
| 'Dahoas/rm-static', |
| 'Dahoas/static-hh', |
| 'Dahoas/synthetic-instruct-gptj-pairwise', |
| 'eli5', |
| 'gsm8k', |
| 'guanaco/guanaco', |
| 'kastan/rlhf-qa-comparisons', |
| 'kastan/rlhf-qa-conditional-generation-v2', |
| 'OllieStanley/humaneval-mbpp-codegen-qa', |
| 'OllieStanley/humaneval-mbpp-testgen-qa', |
| 'Graverman/Instruct-to-Code', |
| 'openai/summarize_from_feedback', |
| 'relbert/analogy_questions', |
| 'yitingxie/rlhf-reward-datasets', |
| 'yizhongw/self_instruct', |
| 'HuggingFaceH4/asss', |
| 'kastan/rlhf-qa-conditional-generation-v2', |
| 'cosmos_qa', |
| 'vishal-burman/c4-faqs', |
| 'squadshifts', |
| 'hotpot_qa', |
| 'adversarial_qa', |
| 'allenai/soda', |
| 'squad_v2', |
| 'squadshifts', |
| 'dferndz/cSQuAD1', |
| 'dferndz/cSQuAD2', |
| 'din0s/msmarco-nlgen', |
| 'domenicrosati/TruthfulQA', |
| 'hotpot_qa', |
| 'HuggingFaceH4/self-instruct-eval', |
| 'kastan/EE_QA_for_RLHF', |
| 'KK04/LogicInference_OA', |
| 'lmqg/qa_squadshifts_synthetic', |
| 'lmqg/qg_squad', |
| 'lmqg/qg_squadshifts', |
| 'lmqg/qg_subjqa', |
| 'pszemraj/HC3-textgen-qa', |
| |
| 'pythonist/newdata', |
| 'ropes', |
| 'wikitablequestions', |
| 'bigscience/p3', |
| ] |
|
|
| code_useful = ['0n1xus/codexglue', |
| 'openai_humaneval', |
| 'koutch/staqc', |
| ] |
|
|
| maybe_useful = ['AlekseyKorshuk/comedy-scripts', |
| 'openbookqa', |
| 'qed', |
| 'selqa', |
| 'HuggingFaceH4/instruction-pilot-outputs-filtered', |
| 'GBaker/MedQA-USMLE-4-options', |
| 'npc-engine/light-batch-summarize-dialogue', |
| ] |
|
|
| summary_useful = ['austin/rheum_abstracts', |
| 'CarperAI/openai_summarize_comparisons', |
| 'CarperAI/openai_summarize_tldr', |
| 'ccdv/cnn_dailymail', |
| 'ccdv/govreport-summarization', |
| 'ccdv/pubmed-summarization', |
| 'duorc', |
| 'farleyknight/big_patent_5_percent', |
| 'multi_news', |
| 'opinosis', |
| 'SophieTr/reddit_clean', |
| 'allenai/mup', |
| 'allenai/multi_lexsum', |
| 'big_patent', |
| 'allenai/wcep_dense_max', |
| 'awinml/costco_long_practice', |
| 'GEM/xsum', |
| 'ratishsp/newshead', |
| 'RussianNLP/wikiomnia', |
| 'stacked-summaries/stacked-xsum-1024', |
| ] |
|
|
| math_useful = [ |
| 'competition_math' |
| ] |
|
|
| skipped = ['c4', |
| ] |
|
|
| """ |
| To get training data from oig: |
| pytest test_oig test_grade_final test_finalize_to_json |
| """ |
|
|
| human = '<human>:' |
| bot = '<bot>:' |
|
|
|
|
| def test_assemble_and_detox(): |
| import re |
| from profanity_check import predict_prob |
| df_list = [] |
| for data in useful_oig_files: |
| print("Processing %s" % data, flush=True) |
| df = pd.read_parquet(data) |
| df = df.reset_index(drop=True) |
| # split each long conversation into shorter blurbs, breaking at <human>: turns
| text_list = df[['text']].values.ravel().tolist() |
| new_text = [] |
| max_len = 2048 |
| MAX_LEN = 2048 // 2 - 30 |
| for text in tqdm(text_list): |
| human_starts = [m.start() for m in re.finditer('<human>: ', text)] |
| if len(human_starts) == 1: |
| human_starts = [0, len(text)] |
| blurb = '' |
| for i in range(len(human_starts) - 1): |
| interaction = text[human_starts[i]: human_starts[i + 1]][:max_len] |
| blurb += interaction |
| if len(blurb) >= MAX_LEN: |
| blurb = get_sentences(blurb, length=MAX_LEN)[0] |
| new_text.append(blurb + "\n<human>:") |
| blurb = '' |
| if blurb: |
| blurb = get_sentences(blurb, length=MAX_LEN)[0] |
| new_text.append(blurb + "\n<human>:") |
|
|
| if len(new_text) > len(text_list): |
| print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0])) |
| df = pd.DataFrame({"text": new_text, "source": [data] * len(new_text)}) |
| df = df.drop_duplicates(keep='first') |
| print(df['text'].apply(lambda x: len(x)).describe()) |
| assert df['text'].apply(lambda x: len(x)).max() <= 2 * max_len |
|
|
| # drop rows flagged as likely profane by alt-profanity-check
| df['profanity'] = predict_prob(df['text']) |
| before_rows = df.shape[0] |
| df = df[df['profanity'] < 0.25] |
| after_rows = df.shape[0] |
| print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows)) |
| df_list.append(df) |
| print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True) |
| print("So far have %d rows" % sum([len(x) for x in df_list])) |
| df_final = pd.concat(df_list) |
| df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True) |
| df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False) |
|
|
|
|
| def test_basic_cleaning(): |
| |
| |
| from profanity_check import predict |
| df_list = [] |
| for data in useful_oig_files: |
| |
| |
| print("Processing %s" % data, flush=True) |
| df = pd.read_parquet(data) |
| df = df.reset_index(drop=True) |
| |
| |
| df['avg_words'] = df['text'].apply(lambda x: x.count(' ') / (x.count(human) + x.count(bot)) / 2.0) |
| df['avg_bot_words'] = df['text'].apply(lambda x: x.split(bot)[1].count(' ') / x.count(bot)) |
| |
| |
| res = predict(df['text']) |
| df['bad_words'] = res |
| df = df.reset_index(drop=True) |
| df = df[df['bad_words'] == 0] |
| df = df[['text', 'avg_words', 'avg_bot_words']] |
| df = df.drop_duplicates(keep='first') |
| print(df[df['avg_words'] == df['avg_words'].max()]['text'].values) |
| median_words = np.median(df['avg_words']) |
| min_words_per_entity = max(30, 0.8 * median_words) |
| max_words_per_entity = 2048 |
| df = df[df['avg_words'] > min_words_per_entity] |
| df = df[df['avg_words'] < max_words_per_entity] |
|
|
| min_words_per_entity = max(20, 0.5 * median_words) |
| max_words_per_entity = 2048 |
| df = df[df['avg_bot_words'] > min_words_per_entity] |
| df = df[df['avg_bot_words'] < max_words_per_entity] |
|
|
| df_list.append(df) |
| print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True) |
| df_final = pd.concat(df_list) |
| df_final.to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False) |
|
|
|
|
| from joblib import Parallel, delayed, effective_n_jobs |
| from sklearn.utils import gen_even_slices |
| from sklearn.utils.validation import _num_samples |
|
|
|
|
| def parallel_apply(df, func, n_jobs=-1, **kwargs): |
| """ Pandas apply in parallel using joblib. |
| Uses sklearn.utils to partition input evenly. |
| |
| Args: |
| df: Pandas DataFrame, Series, or any other object that supports slicing and apply. |
| func: Callable to apply |
| n_jobs: Desired number of workers. Default value -1 means use all available cores. |
| **kwargs: Any additional parameters will be supplied to the apply function |
| |
| Returns: |
| Same as for normal Pandas DataFrame.apply() |
| |
| """ |
|
|
| if effective_n_jobs(n_jobs) == 1: |
| return df.apply(func, **kwargs) |
| else: |
| ret = Parallel(n_jobs=n_jobs)( |
| delayed(type(df).apply)(df[s], func, **kwargs) |
| for s in gen_even_slices(_num_samples(df), effective_n_jobs(n_jobs))) |
| return pd.concat(ret) |
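| # Hedged usage sketch:
| #   df['n_chars'] = parallel_apply(df['text'], len, n_jobs=-1)
| # partitions the input into even slices via sklearn's gen_even_slices, applies the
| # function per slice in joblib workers, and concatenates the results back in order.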
|
|
|
|
| def add_better_profanity_flag(df): |
| from better_profanity import profanity |
| df['better_profanity'] = parallel_apply( |
| df['text'], |
| lambda x: profanity.contains_profanity(x), |
| n_jobs=-1, |
| ) |
| return df |
|
|
|
|
| def add_textstat_grade(df): |
| import textstat |
|
|
| def myfunc(x): |
| return textstat.flesch_kincaid_grade(x) |
|
|
| if False:  # alternative dask-based implementation, kept for reference but disabled
| import dask.dataframe as dd |
| |
| ddata = dd.from_pandas(df, npartitions=120) |
|
|
| df['flesch_grade'] = ddata['text'].apply(myfunc).compute() |
| if True:  # joblib-based implementation actually used
| |
| df['flesch_grade'] = parallel_apply(df['text'], myfunc, n_jobs=-1) |
| return df |
|
|
|
|
| def add_deberta_grade(df): |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer |
| import torch |
| reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2" |
| rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained( |
| reward_name), AutoTokenizer.from_pretrained(reward_name) |
| device = 'cuda' if torch.cuda.is_available() else 'cpu' |
| rank_model.to(device) |
|
|
| def get_question(x): |
| return x.replace('<human>: ', '').split('<bot>:')[0] |
|
|
| def get_answer(x): |
| try: |
| answer = x.split('<bot>: ')[1].split('<human>:')[0].replace('<bot>: ', '') |
| except IndexError:  # no '<bot>: ' with trailing space; fall back to bare '<bot>:'
| answer = x.split('<bot>:')[1].split('<human>:')[0].replace('<bot>:', '') |
| return answer |
|
|
| df['question'] = parallel_apply(df['text'], get_question, n_jobs=-1) |
| df['answer'] = parallel_apply(df['text'], get_answer, n_jobs=-1) |
|
|
| from datasets import Dataset |
| from transformers import pipeline |
| from transformers.pipelines.pt_utils import KeyPairDataset |
| import tqdm |
|
|
| pipe = pipeline( |
| "text-classification", |
| model=reward_name, |
| device="cuda:0" if torch.cuda.is_available() else "cpu" |
| ) |
| start = 0 |
| batch_size = 64 * 16 |
| micro_batch = orig_micro_batch = 16 |
| end = 0 |
| import socket |
| checkpoint = "grades.%s.pkl" % socket.gethostname() |
| grades = [] |
| import pickle |
| if os.path.exists(checkpoint): |
| with open(checkpoint, "rb") as f: |
| start, grades = pickle.loads(f.read()) |
| last_oom = 0 |
| while end < df.shape[0]: |
| |
| end = min(start + batch_size, df.shape[0]) |
| if start == end: |
| break |
| dataset = Dataset.from_pandas(df.iloc[start:end, :]) |
| try: |
| grades.extend([ |
| x['score'] for x in tqdm.tqdm( |
| pipe(KeyPairDataset(dataset, "question", "answer"), batch_size=micro_batch) |
| ) |
| ]) |
| except torch.cuda.OutOfMemoryError: |
| last_oom = start |
| micro_batch = max(1, micro_batch // 2) |
| print("OOM - retrying with micro_batch=%d" % micro_batch) |
| continue |
| if last_oom == start: |
| micro_batch = orig_micro_batch |
| print("Returning to micro_batch=%d" % micro_batch) |
| assert len(grades) == end |
| start = end |
| with open(checkpoint, "wb") as f: |
| f.write(pickle.dumps((end, grades))) |
| print("%d/%d" % (end, df.shape[0])) |
| df['grade_deberta'] = grades |
| if os.path.exists(checkpoint): |
| os.remove(checkpoint) |
| return df |
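| # Design note: progress (end, grades) is checkpointed to a per-host pickle so long
| # grading runs can resume, and micro_batch is halved on CUDA OOM, then restored to the
| # original size once the window that previously OOM'd completes.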
|
|
|
|
| def test_chop_by_lengths(): |
| file = "h2oGPT.cleaned.human_bot.shorter.parquet" |
| df = pd.read_parquet(file).reset_index(drop=True) |
| df = count_human_bot_lengths(df) |
| df['rand'] = np.random.rand(df.shape[0]) |
| df['rand2'] = np.random.rand(df.shape[0]) |
| before_rows = df.shape[0] |
| # filter by turn lengths; rand/rand2 let a fraction of shorter interactions survive
| df = df[(df['len_human_mean'] > 20)] |
| df = df[(df['len_human_mean'] > 30) | (df['rand'] < 0.2)] |
| df = df[(df['len_human_mean'] > 50) | (df['rand'] < 0.5)] |
| df = df[(df['len_human_max'] < 10000)] |
| df = df[(df['len_bot_mean'] > 20)] |
| df = df[(df['len_bot_mean'] > 30) | (df['rand2'] < 0.2)] |
| df = df[(df['len_bot_mean'] > 50) | (df['rand2'] < 0.5)] |
| df = df[(df['len_bot_max'] < 10000)] |
| assert df['text'].apply(lambda x: len(x)).max() < 20000 |
| df = df.drop(['rand', 'rand2'], axis=1) |
| after_rows = df.shape[0] |
| print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows)) |
| print(df.describe()) |
| df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False) |
|
|
|
|
| def count_human_bot_lengths(df, human=None, bot=None): |
| import re |
| len_human_min = [] |
| len_human_max = [] |
| len_human_mean = [] |
| len_bot_min = [] |
| len_bot_max = [] |
| len_bot_mean = [] |
| human = human or '<human>:' |
| bot = bot or '<bot>:' |
| for is_human in [True, False]: |
| what = human if is_human else bot |
| other = human if not is_human else bot |
| for i in range(df.shape[0]): |
| text = df.loc[i, 'text'] |
| assert isinstance(text, str) |
| starts = [m.start() for m in re.finditer(what, text)] |
| if len(starts) == 1: |
| starts = [starts[0], len(text)] |
| assert len(text) |
| list_what = [] |
| for ii in range(len(starts) - 1): |
| interaction = text[starts[ii]: starts[ii + 1]] |
| if other in interaction: |
| interaction = interaction[:interaction.find(other)] |
| interaction = interaction.strip()  # str.strip() returns a new string; assign it
| list_what.append(interaction) |
| if not list_what: |
| list_what = [''] |
| if is_human: |
| len_human_min.append(min([len(x) for x in list_what])) |
| len_human_max.append(max([len(x) for x in list_what])) |
| len_human_mean.append(np.mean([len(x) for x in list_what])) |
| else: |
| len_bot_min.append(min([len(x) for x in list_what])) |
| len_bot_max.append(max([len(x) for x in list_what])) |
| len_bot_mean.append(np.mean([len(x) for x in list_what])) |
| df['len_human_min'] = len_human_min |
| df['len_human_max'] = len_human_max |
| df['len_human_mean'] = len_human_mean |
| df['len_bot_min'] = len_bot_min |
| df['len_bot_max'] = len_bot_max |
| df['len_bot_mean'] = len_bot_mean |
| np.random.seed(1234) |
| pd.set_option('display.max_columns', None) |
| print("Before chopping") |
| print(df.describe()) |
| return df |
|
|
|
|
| def test_grade(): |
| df = None |
|
|
| file = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet" |
| output_file = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet" |
| if not os.path.exists(output_file): |
| if df is None: |
| df = pd.read_parquet(file).reset_index(drop=True) |
| df = add_textstat_grade(df) |
| min_grade = 10 |
| max_grade = 25 |
| df = df[df['flesch_grade'] >= min_grade] |
| df = df[df['flesch_grade'] <= max_grade] |
| print("After Flesch grade") |
| print(df.describe()) |
| df.to_parquet(output_file, index=False) |
|
|
| file = output_file |
| output_file = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet" |
| if not os.path.exists(output_file): |
| |
| if df is None: |
| df = pd.read_parquet(file).reset_index(drop=True) |
| df = add_better_profanity_flag(df) |
| before_rows = df.shape[0] |
| df = df[df['better_profanity'] == 0] |
| df = df.drop(['better_profanity'], axis=1) |
| after_rows = df.shape[0] |
| print("Dropped %d rows out of %d due to better_profanity" % (before_rows - after_rows, before_rows)) |
| print(df.describe()) |
| df.to_parquet(output_file, index=False) |
|
|
| file = output_file |
| output_file = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet' |
| if not os.path.exists(output_file): |
| if df is None: |
| df = pd.read_parquet(file).reset_index(drop=True) |
| df = add_deberta_grade(df) |
| min_grade = 0.3 |
| max_grade = np.inf |
| before_rows = df.shape[0] |
| df = df[df['grade_deberta'] >= min_grade] |
| df = df[df['grade_deberta'] <= max_grade] |
| after_rows = df.shape[0] |
| print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows)) |
| print("After DeBERTa grade") |
| print(df.describe()) |
| df.to_parquet(output_file, index=False) |
|
|
| file = output_file |
| output_file = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet' |
| if df is None: |
| df = pd.read_parquet(file).reset_index(drop=True) |
| df.to_parquet(output_file, index=False) |
|
|
|
|
| @pytest.mark.parametrize( |
| "fixup_personality, only_personality, deberta_grading", |
| [ |
| |
| |
| [True, False, False], |
| |
| ] |
| ) |
| @pytest.mark.parametrize("prompt_type", ["llama2"]) |
| def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, prompt_type, save_json=True): |
| """ |
| Flatten tree structure into one row per path from root to leaf |
| Also turn into human_bot prompting format: |
| <human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc. |
| Also saves a .json locally as side-effect |
| returns list of dicts, containing input, prompt_type and source
| """ |
| from datasets import load_dataset |
| data_file = "OpenAssistant/oasst1" |
| ds = load_dataset(data_file) |
| df = pd.concat([ds['train'].to_pandas(), ds['validation'].to_pandas()], axis=0) |
| rows = {} |
| message_ids = df['message_id'].values.tolist() |
| message_tree_ids = df['message_tree_id'].values.tolist() |
| parent_ids = df['parent_id'].values.tolist() |
| texts = df['text'].values.tolist() |
| roles = df['role'].values.tolist() |
| deleteds = df['deleted'].values.tolist() |
| for i in range(df.shape[0]): |
| |
| message_id = message_ids[i] |
| message_tree_id = message_tree_ids[i] |
| parent_id = parent_ids[i] |
| text = texts[i] |
| deleted = deleteds[i] |
| if deleted: |
| continue |
| if fixup_personality: |
| text = text.replace("Open Assistant", "h2oGPT") |
| text = text.replace("Open-Assistant", "h2oGPT") |
| text = text.replace("open-assistant", "h2oGPT") |
| text = text.replace("OpenAssistant", "h2oGPT") |
| text = text.replace("open assistant", "h2oGPT") |
| text = text.replace("Open Assistand", "h2oGPT") |
| text = text.replace("Open Assitant", "h2oGPT") |
| text = text.replace("Open Assistent", "h2oGPT") |
| text = text.replace("Open Assisstant", "h2oGPT") |
| text = text.replace("Open Assitent", "h2oGPT") |
| text = text.replace("Open Assitiant", "h2oGPT") |
| text = text.replace("Open Assistiant", "h2oGPT") |
| text = text.replace("Open Assitan ", "h2oGPT ") |
| text = text.replace("Open Assistan ", "h2oGPT ") |
| text = text.replace("Open Asistant", "h2oGPT") |
| text = text.replace("Open Assiant", "h2oGPT") |
| text = text.replace("Assistant", "h2oGPT") |
| text = text.replace("LAION AI", "H2O.ai") |
| text = text.replace("LAION-AI", "H2O.ai") |
| text = text.replace("LAION,", "H2O.ai,") |
| text = text.replace("LAION.ai", "H2O.ai") |
| text = text.replace("LAION.", "H2O.ai.") |
| text = text.replace("LAION", "H2O.ai") |
|
|
| role = roles[i] |
| if prompt_type == "llama2": |
| new_data = ('[INST] ' if role == 'prompter' else ' [/INST] ') + text |
| if parent_id and role == 'prompter': |
| new_data = " " + new_data |
| elif prompt_type == "human_bot": |
| new_data = ('<human>: ' if role == 'prompter' else '<bot>: ') + text |
| else: |
| raise NotImplementedError("prompt_type not supported") |
| entry = dict(message_id=message_id, parent_id=parent_id, text=new_data) |
| if message_tree_id not in rows: |
| rows[message_tree_id] = [entry] |
| else: |
| rows[message_tree_id].append(entry) |
|
|
| all_rows = [] |
|
|
| for node_id in rows: |
| # conversations holds one growing path per branch; message_id concatenates ids along the path
| conversations = [] |
|
|
| list_msgs = rows[node_id] |
| # repeatedly attach each remaining message to the conversation whose tip is its parent
| while len(list_msgs): |
| for i, leaf in enumerate(list_msgs): |
| found = False |
| parent_id = leaf['parent_id'] |
| if parent_id is None: |
| # a message with no parent is a root: it starts a new conversation path
| conversations.append(leaf) |
| found = True |
| else: |
| for conv in conversations: |
| # message_id is a concatenation of ids, so substring tests locate the parent in a path
| if parent_id in conv['message_id'] and parent_id != conv['message_id'][-len(parent_id):]: |
| # parent occurs mid-path, not at the tip: this path already continued past it
| continue |
| if parent_id == conv['message_id'][-len(parent_id):]: |
| # tip matches: keep a copy of the parent path so sibling replies can branch from it too
| conversations.append(conv.copy()) |
| if prompt_type == "llama2": |
| conv['text'] += f"""{leaf['text']}""" |
| elif prompt_type == "human_bot": |
| conv['text'] += f""" |
| {leaf['text']} |
| """ |
| else: |
| raise NotImplementedError |
| conv['message_id'] += leaf['message_id'] |
| found = True |
| break |
| if found: |
| # message was attached (or was a root): remove it and rescan the remainder
| del list_msgs[i] |
| break |
|
|
| # prune any conversation whose message_id is a strict substring (prefix path) of another's
| for i, conv in enumerate(conversations): |
| for j, conv2 in enumerate(conversations): |
| if i == j: |
| continue |
| if conv['message_id'] and conv2['message_id']: |
| assert conv['message_id'] != conv2['message_id'] |
| |
| if conv['message_id'] in conv2['message_id']:
| conv['message_id'] = None
| elif conv2['message_id'] in conv['message_id']:  # elif avoids membership test against None
| conv2['message_id'] = None
| conversations = [c for c in conversations if c['message_id']] |
| if only_personality: |
| if prompt_type == "human_bot": |
| all_rows.extend( |
| [dict(input=c['text'] + "\n<human>:", output="", prompt_type='plain', source=data_file) for c in conversations if |
| 'h2oGPT' in c['text']]) |
| elif prompt_type == "llama2": |
| all_rows.extend( |
| [dict(input=c['text'] + |
| ("" if c['text'].rfind("[/INST]") > c['text'].rfind("[INST]") else " [/INST]"), |
| output="", prompt_type='plain', source=data_file) for c in conversations if |
| 'h2oGPT' in c['text']]) |
| else: |
| raise NotImplementedError |
| else: |
| if prompt_type == "human_bot": |
| all_rows.extend( |
| [dict(input=c['text'] + "\n<human>:", output="", prompt_type='plain', source=data_file) for c in conversations |
| if |
| "What is H2O.ai" not in c['text']]) |
| elif prompt_type == "llama2": |
| all_rows.extend( |
| [dict(input=c['text'] + |
| (" " if c['text'].rfind("[/INST]") > c['text'].rfind("[INST]") else " [/INST]"), |
| output="", prompt_type='plain', source=data_file) for c in conversations if |
| "What is H2O.ai" not in c['text']]) |
| else: |
| raise NotImplementedError |
|
|
| unhelpful = get_unhelpful_list() |
| all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)] |
| personality = create_personality_data(prompt_type=prompt_type) |
| all_rows.extend(personality * 10) |
| np.random.seed(123) |
| np.random.shuffle(all_rows) |
| print(len(all_rows)) |
| if deberta_grading: |
| df = pd.DataFrame(all_rows) |
| df = df.rename(columns={'input': 'text'}) |
| df = add_deberta_grade(df) |
| df = df.rename(columns={'text': 'input'}) |
| drop = True |
| if drop: |
| min_grade = 0.3 |
| max_grade = np.inf |
| before_rows = df.shape[0] |
| df = df[df['grade_deberta'] >= min_grade] |
| df = df[df['grade_deberta'] <= max_grade] |
| after_rows = df.shape[0] |
| print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows)) |
| print("After DeBERTa grade") |
| print(df.describe()) |
| all_rows = [] |
| for i in range(df.shape[0]): |
| all_rows.append( |
| dict( |
| input=df['input'].iloc[i], |
| output=df['output'].iloc[i], |
| source=df['source'].iloc[i], |
| prompt_type=df['prompt_type'].iloc[i], |
| grade_deberta=df['grade_deberta'].iloc[i], |
| ) |
| ) |
| if save_json: |
| data_file = data_file + \ |
| ("_h2ogpt" if fixup_personality else "") + \ |
| ("_only" if only_personality else "") + \ |
| ("_graded" if deberta_grading else "") + \ |
| ("_llama2_chat" if prompt_type == "llama2" else "") |
| for i in range(len(all_rows)): |
| all_rows[i]['id'] = i |
| with open(data_file.lower().replace("/", "_") + ".json", "w") as f: |
| f.write(json.dumps(all_rows, indent=2)) |
| return all_rows |
|
|
|
|
| def test_finalize_to_json(): |
| df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet') |
| df = df.rename(columns={'text': 'input'}) |
|
|
| print("Number of high-quality human_bot interactions: %s" % df.shape[0], flush=True) |
|
|
| print("Adding open assistant data") |
| with open("openassistant_oasst1_h2ogpt_graded.json") as f: |
| open_assistant = json.loads(f.read()) |
| df = pd.concat([df, pd.DataFrame(open_assistant)], axis=0) |
|
|
| def final_clean(df): |
| from better_profanity import profanity |
| profanity.load_censor_words_from_file("data/censor_words.txt") |
| df['profanity'] = parallel_apply( |
| df['input'], |
| lambda x: profanity.contains_profanity(x), |
| n_jobs=-1, |
| ) |
| return df[(df['profanity'] == 0)].reset_index(drop=True) |
|
|
| print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True) |
| df = final_clean(df) |
| print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True) |
| print(df.describe()) |
| print(df.shape) |
| row_list = [] |
| for i in range(df.shape[0]): |
| row_list.append( |
| dict( |
| input=df.loc[i, 'input'], |
| source=df.loc[i, 'source'], |
| prompt_type='plain', |
| ) |
| ) |
| np.random.seed(1234) |
| np.random.shuffle(row_list) |
| unhelpful = get_unhelpful_list() |
| row_list = [x for x in row_list if not any(u in x['input'] for u in unhelpful)] |
| for i in range(len(row_list)): |
| row_list[i]['id'] = i |
| row_list[i]['input'] = row_list[i]['input'].replace(" <bot>:", "\n<bot>:") |
| with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f: |
| f.write(json.dumps(row_list, indent=2)) |
|
|
|
|
| def create_personality_data(prompt_type="llama2"): |
| questions = [ |
| "What's your name?", |
| "What is your name?", |
| "What are you?", |
| "Who are you?", |
| "Do you have a name?", |
| "Who trained you?", |
| "Who created you?", |
| "Who made you?", |
| ] |
| answers = [ |
| "I'm h2oGPT, a large language model by H2O.ai.", |
| "I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.", |
| "My name is h2oGPT. I'm a large language model by H2O.ai, the visionary leader in democratizing AI.", |
| "My name is h2oGPT. I'm a large language model trained by H2O.ai.", |
| "Hi! I'm h2oGPT, a large language model by H2O.ai.", |
| "Hi! I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.", |
| ] |
| help = [ |
| "", |
| " How can I help you?", |
| " How may I assist you?", |
| " Nice to meet you.", |
| ] |
| import itertools |
| rows = [] |
| for pair in itertools.product(questions, answers, help): |
| rows.append( |
| dict(input=f"{pair[0]}", output=f"{pair[1]}{pair[2]}", prompt_type=prompt_type, source="H2O.ai") |
| ) |
| for q, a in [ |
| ("What is H2O.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), |
| ("What is h2o.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), |
| ("What is H2O?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), |
| ("Who is h2o.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), |
| ("who is h2o.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), |
| ("who is h2o?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), |
| ("what is H2O.ai?", "H2O.ai is the visionary leader in democratizing AI."), |
| ("who is H2O.ai?", "H2O.ai is the visionary leader in democratizing AI."), |
| ("who is H2O?", "H2O.ai is the visionary leader in democratizing AI."), |
| ("Who is h20?", "H2O.ai is the visionary leader in democratizing AI."), |
| ]: |
| rows.append(dict(input=q, output=a, prompt_type=prompt_type, source='H2O.ai')) |
| print(len(rows)) |
| with open("h2ogpt-personality.json", "w") as f: |
| f.write(json.dumps(rows, indent=2)) |
| return rows |
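| # For scale (arithmetic from the lists above): 8 questions * 6 answers * 4 help
| # suffixes = 192 product rows, plus 10 fixed Q/A pairs, i.e. 202 rows total.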
|
|
|
|
| def test_check_stats_data(): |
| filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json' |
| df = pd.read_json(filename) |
|
|
| # histogram of input character counts
| df['char_count'] = df['input'].apply(lambda x: len(x)) |
| import matplotlib.pyplot as plt |
| plt.figure(figsize=(10, 10)) |
| plt.hist(df['char_count'], bins=100) |
| chars_avg = np.mean(df['char_count']) |
| chars_median = np.median(df['char_count']) |
| plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median)) |
| plt.savefig('chars_hist.png') |
| plt.close() |
|
|
| # token-count histogram using the actual tokenizer and prompt pipeline
| from finetune import generate_and_tokenize_prompt |
| from loaders import get_loaders, get_tokenizer |
| from functools import partial |
|
|
| llama_type = False |
| tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b' |
| model_loader, tokenizer_loader, conditional_type = ( |
| get_loaders(model_name=base_model, reward_type=False, llama_type=llama_type)) |
| local_files_only = False |
| resume_download = True |
| use_auth_token = False |
| tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token) |
| prompt_type = 'plain' |
| train_on_inputs = True |
| add_eos_token = False |
| cutoff_len = 512 |
| generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type, |
| train_on_inputs=train_on_inputs, add_eos_token=add_eos_token, |
| cutoff_len=cutoff_len, tokenizer=tokenizer) |
| from datasets import load_dataset |
| data = load_dataset("json", data_files={"train": filename}) |
| val_set_size = 0.90 |
| train_val = data["train"].train_test_split( |
| test_size=val_set_size, shuffle=True, seed=42 |
| ) |
| train_data = train_val["train"] |
| train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count()) |
|
|
| df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count']) |
|
|
| plt.figure(figsize=(10, 10)) |
| plt.hist(df_tokens['token_count'], bins=100) |
| token_avg = np.mean(df_tokens['token_count']) |
| token_median = np.median(df_tokens['token_count']) |
| plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median)) |
| plt.savefig('token_hist_%s.png' % cutoff_len) |
| plt.close() |
|
|
|
|
| def get_unhelpful_list(): |
| # substrings that mark canned, unhelpful responses; used to filter training rows
| unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?", |
| "I'm sorry, but I don't understand your question. Could you please rephrase it?", |
| "I'm sorry, I don't quite understand your question", |
| "I'm sorry, I don't know", |
| "I'm sorry, but I don't know", |
| "I don't know anything", |
| "I do not know", |
| "I don't know", |
| "I don't know how", |
| "I do not know how", |
| "Can you please explain what you mean", |
| "please explain what you mean", |
| "please explain", |
| "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by", |
| "I'm sorry but I don't understand what you mean", |
| "I don't understand", |
| "I don't have the ability", |
| "I do not have the ability", |
| "I do not have", |
| "I am a language model,", |
| "I am a large language model,", |
| "I do not understand your question. Can you please try to make it clearer?", |
| "I'm sorry, but as an AI language model", |
| "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.", |
| "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?", |
| "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t", |
| "I apologize, but I cannot perform the task you have requested.", |
| "I'm sorry, I cannot perform this task as I am an AI language model and do not have access", |
| "I'm sorry, I'm not sure what you're asking for here.", |
| "I'm not sure what you are asking", |
| "You need to provide more context", |
| ] |
| |
| unhelpful += ["sorry, I didn't quite understand your question", |
| "I didn't quite understand your question", |
| "I didn't understand your question", |
| "I did not understand your question", |
| "I did not understand the question", |
| "could you please rephrase" |
| "could you rephrase" |
| "I do not understand your question.", |
| "I do not understand the question.", |
| "I do not understand that question.", |
| "Can you please try to make it clearer", |
| "Can you try to make it clearer", |
| "sorry, but as an AI language model", |
| "as an AI language model", |
| "I apologize, but I cannot", |
| "I cannot rephrase text", |
| "I cannot understand. Your post is difficult to read and follow." |
| "Your post is difficult to read and follow." |
| "I apologize, but I am", |
| "Sorry, but I am not ", |
| "nor am I capable", |
| "I am not capable of", |
| "I apologize, but I cannot perform the task you have requested", |
| "I cannot perform the task", |
| "I cannot complete the task", |
| "I'm sorry", |
| "I am sorry", |
| "do not have access", |
| "not sure what you're asking for", |
| "not sure what you are asking for", |
| "not sure what is being asked", |
| "I'm not sure what you are asking", |
| "not sure what you are asking", |
| "You need to provide more context", |
| "provide more context", |
| ] |
| unhelpful += ["As a large language model", |
| "cannot provide any information", |
| "As an artificial intelligence I do not have the capability", |
| "As an artificial intelligence I don't have the capability", |
| "As an artificial intelligence I can't", |
| "As an artificial intelligence I cannot", |
| "I am sorry but I do not understand", |
| "Can you please explain", |
| "(sorry couldn't resist)", |
| "(sorry could not resist)", |
| " :)", |
| " ;)", |
| " :-)", |
| " ;-)", |
| " lol ", |
| "Thanks so much!!!", |
| "Thank You :)!!!", |
| "Please try not to repeat", |
| "I am an AI language model", |
| "I'm a AI assistant that", |
| "I'm an AI assistant that", |
| "I am an AI assistant that", |
| "etc.", |
| "etc.etc.", |
| "etc. etc.", |
| "etc etc", |
| ] |
| return unhelpful |
|
|
|
|
| def test_check_unhelpful(): |
| # hard-coded local path; adjust to wherever the graded json lives
| file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json' |
| |
|
|
| unhelpful = get_unhelpful_list() |
| |
| df = pd.read_json(file) |
|
|
| use_reward_score_threshold = False |
| use_bleu_threshold = False |
| use_sentence_sim = True |
|
|
| from sacrebleu.metrics import BLEU |
| bleu = BLEU()  # corpus-level BLEU alternative; get_bleu below uses nltk's sentence_bleu
| from nltk.translate.bleu_score import sentence_bleu |
|
|
| def get_bleu(actual, expected_list): |
| |
| return sentence_bleu(expected_list, actual) |
|
|
| threshold = 0.0 |
| if use_reward_score_threshold: |
| df = df[df['grade_deberta'] > threshold] |
|
|
| # count occurrences of each unhelpful substring across the whole dataset
| data = df.to_dict(orient='records') |
| bads = {} |
| string_all = str(data) |
| for sub in unhelpful: |
| bads[sub] = string_all.count(sub) |
| bads = {k: v for k, v in bads.items() if v > 0} |
| import pprint |
| pp = pprint.PrettyPrinter(indent=4) |
| pp.pprint(bads) |
|
|
| total_bads = sum(list(bads.values())) |
| print('total_bads: %s' % total_bads, flush=True) |
|
|
| |
| import re |
| convs = [[x.strip() for x in re.split(r'%s|%s' % (human, bot), y['input']) if x.strip()] for y in data] |
| humans = [[x for i, x in enumerate(y) if i % 2 == 0] for y in convs] |
| bots = [[x for i, x in enumerate(y) if i % 2 == 1] for y in convs] |
|
|
| |
| bleu_threshold = 0.9 |
| if use_bleu_threshold: |
| bots = [[x for x in y if get_bleu(x, unhelpful) < bleu_threshold] for y in tqdm(bots)] |
|
|
| cosine_sim_threshold = 0.8 |
| if use_sentence_sim: |
| |
| from sentence_transformers import SentenceTransformer |
| |
| |
| sent_model = 'all-MiniLM-L6-v2' |
| model = SentenceTransformer(sent_model) |
| sentence_embeddings = model.encode(unhelpful) |
| from sklearn.metrics.pairwise import cosine_similarity |
| bots = [x for x in tqdm(bots) if |
| np.max(cosine_similarity(model.encode(x), sentence_embeddings)) < cosine_sim_threshold] |
|
|
| bads_bots = {} |
| string_all = str(bots) |
| for sub in unhelpful: |
| bads_bots[sub] = string_all.count(sub) |
| bads_bots = {k: v for k, v in bads_bots.items() if v > 0} |
| import pprint |
| pp = pprint.PrettyPrinter(indent=4) |
| pp.pprint(bads_bots) |
|
|
| total_bads_bots = sum(list(bads_bots.values())) |
| print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % ( |
| threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True) |
|
|
| |
| assert len(bads_bots) == 0, bads_bots |
|
|
|
|
| def test_fortune2000_personalized(): |
| row_list = [] |
| import glob |
| if not os.path.isdir("wikitext"): |
| raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip") |
| for file in glob.glob("wikitext/*.txt"): |
| with open(file, "r") as f: |
| blob = f.read() |
| N = 512 * 4 |
| row_list.extend([{'input': s, 'prompt_type': 'plain', 'source': "%s" % os.path.basename(file)} |
| for s in get_sentences(blob, N) if s]) |
| personality = create_personality_data() |
| import copy |
| for i in range(10): |
| row_list.extend(copy.deepcopy(personality)) |
| np.random.seed(123) |
| np.random.shuffle(row_list) |
| for i in range(len(row_list)): |
| row_list[i]['id'] = i |
| for i in range(len(row_list)): |
| assert row_list[i]['id'] == i |
| with open("h2ogpt-fortune2000-personalized.json", "w") as ff: |
| ff.write(json.dumps(row_list, indent=2)) |
|
|