code-search-net/code_search_net
Viewer • Updated • 4.14M • 25.4k • 329
How to use BoghdadyJR/al-MiniLM-L6-v2 with sentence-transformers:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("BoghdadyJR/al-MiniLM-L6-v2")
sentences = [
"KeypointsOnImage.to_xy_array",
"def to_xy_array(self):\n \"\"\"\n Convert keypoint coordinates to ``(N,2)`` array.\n\n Returns\n -------\n (N, 2) ndarray\n Array containing the coordinates of all keypoints.\n Shape is ``(N,2)`` with coordinates in xy-form.\n\n \"\"\"\n result = np.zeros((len(self.keypoints), 2), dtype=np.float32)\n for i, keypoint in enumerate(self.keypoints):\n result[i, 0] = keypoint.x\n result[i, 1] = keypoint.y\n return result",
"def _generateMetricSpecs(options):\n \"\"\" Generates the Metrics for a given InferenceType\n\n Parameters:\n -------------------------------------------------------------------------\n options: ExpGenerator options\n retval: (metricsList, optimizeMetricLabel)\n metricsList: list of metric string names\n optimizeMetricLabel: Name of the metric which to optimize over\n\n \"\"\"\n inferenceType = options['inferenceType']\n inferenceArgs = options['inferenceArgs']\n predictionSteps = inferenceArgs['predictionSteps']\n metricWindow = options['metricWindow']\n if metricWindow is None:\n metricWindow = int(Configuration.get(\"nupic.opf.metricWindow\"))\n\n metricSpecStrings = []\n optimizeMetricLabel = \"\"\n\n # -----------------------------------------------------------------------\n # Generate the metrics specified by the expGenerator paramters\n metricSpecStrings.extend(_generateExtraMetricSpecs(options))\n\n # -----------------------------------------------------------------------\n\n optimizeMetricSpec = None\n # If using a dynamically computed prediction steps (i.e. 
when swarming\n # over aggregation is requested), then we will plug in the variable\n # predictionSteps in place of the statically provided predictionSteps\n # from the JSON description.\n if options['dynamicPredictionSteps']:\n assert len(predictionSteps) == 1\n predictionSteps = ['$REPLACE_ME']\n\n # -----------------------------------------------------------------------\n # Metrics for temporal prediction\n if inferenceType in (InferenceType.TemporalNextStep,\n InferenceType.TemporalAnomaly,\n InferenceType.TemporalMultiStep,\n InferenceType.NontemporalMultiStep,\n InferenceType.NontemporalClassification,\n 'MultiStep'):\n\n predictedFieldName, predictedFieldType = _getPredictedField(options)\n isCategory = _isCategory(predictedFieldType)\n metricNames = ('avg_err',) if isCategory else ('aae', 'altMAPE')\n trivialErrorMetric = 'avg_err' if isCategory else 'altMAPE'\n oneGramErrorMetric = 'avg_err' if isCategory else 'altMAPE'\n movingAverageBaselineName = 'moving_mode' if isCategory else 'moving_mean'\n\n # Multi-step metrics\n for metricName in metricNames:\n metricSpec, metricLabel = \\\n _generateMetricSpecString(field=predictedFieldName,\n inferenceElement=InferenceElement.multiStepBestPredictions,\n metric='multiStep',\n params={'errorMetric': metricName,\n 'window':metricWindow,\n 'steps': predictionSteps},\n returnLabel=True)\n metricSpecStrings.append(metricSpec)\n\n # If the custom error metric was specified, add that\n if options[\"customErrorMetric\"] is not None :\n metricParams = dict(options[\"customErrorMetric\"])\n metricParams['errorMetric'] = 'custom_error_metric'\n metricParams['steps'] = predictionSteps\n # If errorWindow is not specified, make it equal to the default window\n if not \"errorWindow\" in metricParams:\n metricParams[\"errorWindow\"] = metricWindow\n metricSpec, metricLabel =_generateMetricSpecString(field=predictedFieldName,\n inferenceElement=InferenceElement.multiStepPredictions,\n metric=\"multiStep\",\n 
params=metricParams,\n returnLabel=True)\n metricSpecStrings.append(metricSpec)\n\n # If this is the first specified step size, optimize for it. Be sure to\n # escape special characters since this is a regular expression\n optimizeMetricSpec = metricSpec\n metricLabel = metricLabel.replace('[', '\\\\[')\n metricLabel = metricLabel.replace(']', '\\\\]')\n optimizeMetricLabel = metricLabel\n\n if options[\"customErrorMetric\"] is not None :\n optimizeMetricLabel = \".*custom_error_metric.*\"\n\n # Add in the trivial metrics\n if options[\"runBaselines\"] \\\n and inferenceType != InferenceType.NontemporalClassification:\n for steps in predictionSteps:\n metricSpecStrings.append(\n _generateMetricSpecString(field=predictedFieldName,\n inferenceElement=InferenceElement.prediction,\n metric=\"trivial\",\n params={'window':metricWindow,\n \"errorMetric\":trivialErrorMetric,\n 'steps': steps})\n )\n\n ##Add in the One-Gram baseline error metric\n #metricSpecStrings.append(\n # _generateMetricSpecString(field=predictedFieldName,\n # inferenceElement=InferenceElement.encodings,\n # metric=\"two_gram\",\n # params={'window':metricWindow,\n # \"errorMetric\":oneGramErrorMetric,\n # 'predictionField':predictedFieldName,\n # 'steps': steps})\n # )\n #\n #Include the baseline moving mean/mode metric\n if isCategory:\n metricSpecStrings.append(\n _generateMetricSpecString(field=predictedFieldName,\n inferenceElement=InferenceElement.prediction,\n metric=movingAverageBaselineName,\n params={'window':metricWindow\n ,\"errorMetric\":\"avg_err\",\n \"mode_window\":200,\n \"steps\": steps})\n )\n else :\n metricSpecStrings.append(\n _generateMetricSpecString(field=predictedFieldName,\n inferenceElement=InferenceElement.prediction,\n metric=movingAverageBaselineName,\n params={'window':metricWindow\n ,\"errorMetric\":\"altMAPE\",\n \"mean_window\":200,\n \"steps\": steps})\n )\n\n\n\n\n # -----------------------------------------------------------------------\n # Metrics for 
classification\n elif inferenceType in (InferenceType.TemporalClassification):\n\n metricName = 'avg_err'\n trivialErrorMetric = 'avg_err'\n oneGramErrorMetric = 'avg_err'\n movingAverageBaselineName = 'moving_mode'\n\n optimizeMetricSpec, optimizeMetricLabel = \\\n _generateMetricSpecString(inferenceElement=InferenceElement.classification,\n metric=metricName,\n params={'window':metricWindow},\n returnLabel=True)\n\n metricSpecStrings.append(optimizeMetricSpec)\n\n if options[\"runBaselines\"]:\n # If temporal, generate the trivial predictor metric\n if inferenceType == InferenceType.TemporalClassification:\n metricSpecStrings.append(\n _generateMetricSpecString(inferenceElement=InferenceElement.classification,\n metric=\"trivial\",\n params={'window':metricWindow,\n \"errorMetric\":trivialErrorMetric})\n )\n metricSpecStrings.append(\n _generateMetricSpecString(inferenceElement=InferenceElement.classification,\n metric=\"two_gram\",\n params={'window':metricWindow,\n \"errorMetric\":oneGramErrorMetric})\n )\n metricSpecStrings.append(\n _generateMetricSpecString(inferenceElement=InferenceElement.classification,\n metric=movingAverageBaselineName,\n params={'window':metricWindow\n ,\"errorMetric\":\"avg_err\",\n \"mode_window\":200})\n )\n\n\n # Custom Error Metric\n if not options[\"customErrorMetric\"] == None :\n #If errorWindow is not specified, make it equal to the default window\n if not \"errorWindow\" in options[\"customErrorMetric\"]:\n options[\"customErrorMetric\"][\"errorWindow\"] = metricWindow\n optimizeMetricSpec = _generateMetricSpecString(\n inferenceElement=InferenceElement.classification,\n metric=\"custom\",\n params=options[\"customErrorMetric\"])\n optimizeMetricLabel = \".*custom_error_metric.*\"\n\n metricSpecStrings.append(optimizeMetricSpec)\n\n\n # -----------------------------------------------------------------------\n # If plug in the predictionSteps variable for any dynamically generated\n # prediction steps\n if 
options['dynamicPredictionSteps']:\n for i in range(len(metricSpecStrings)):\n metricSpecStrings[i] = metricSpecStrings[i].replace(\n \"'$REPLACE_ME'\", \"predictionSteps\")\n optimizeMetricLabel = optimizeMetricLabel.replace(\n \"'$REPLACE_ME'\", \".*\")\n return metricSpecStrings, optimizeMetricLabel",
"def create_perf_attrib_stats(perf_attrib, risk_exposures):\n \"\"\"\n Takes perf attribution data over a period of time and computes annualized\n multifactor alpha, multifactor sharpe, risk exposures.\n \"\"\"\n summary = OrderedDict()\n total_returns = perf_attrib['total_returns']\n specific_returns = perf_attrib['specific_returns']\n common_returns = perf_attrib['common_returns']\n\n summary['Annualized Specific Return'] =\\\n ep.annual_return(specific_returns)\n summary['Annualized Common Return'] =\\\n ep.annual_return(common_returns)\n summary['Annualized Total Return'] =\\\n ep.annual_return(total_returns)\n\n summary['Specific Sharpe Ratio'] =\\\n ep.sharpe_ratio(specific_returns)\n\n summary['Cumulative Specific Return'] =\\\n ep.cum_returns_final(specific_returns)\n summary['Cumulative Common Return'] =\\\n ep.cum_returns_final(common_returns)\n summary['Total Returns'] =\\\n ep.cum_returns_final(total_returns)\n\n summary = pd.Series(summary, name='')\n\n annualized_returns_by_factor = [ep.annual_return(perf_attrib[c])\n for c in risk_exposures.columns]\n cumulative_returns_by_factor = [ep.cum_returns_final(perf_attrib[c])\n for c in risk_exposures.columns]\n\n risk_exposure_summary = pd.DataFrame(\n data=OrderedDict([\n (\n 'Average Risk Factor Exposure',\n risk_exposures.mean(axis='rows')\n ),\n ('Annualized Return', annualized_returns_by_factor),\n ('Cumulative Return', cumulative_returns_by_factor),\n ]),\n index=risk_exposures.columns,\n )\n\n return summary, risk_exposure_summary"
]
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [4, 4]

This is a sentence-transformers model finetuned from sentence-transformers/all-mpnet-base-v2 on the code-search-net/code_search_net dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
SentenceTransformer(
(0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
(2): Normalize()
)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
# Download from the 🤗 Hub
model = SentenceTransformer("BoghdadyJR/al-MiniLM-L6-v2")
# Run inference
sentences = [
'Keypoint.copy',
'def copy(self, x=None, y=None):\n """\n Create a shallow copy of the Keypoint object.\n\n Parameters\n ----------\n x : None or number, optional\n Coordinate of the keypoint on the x axis.\n If ``None``, the instance\'s value will be copied.\n\n y : None or number, optional\n Coordinate of the keypoint on the y axis.\n If ``None``, the instance\'s value will be copied.\n\n Returns\n -------\n imgaug.Keypoint\n Shallow copy.\n\n """\n return self.deepcopy(x=x, y=y)',
'def build_words_dataset(words=None, vocabulary_size=50000, printable=True, unk_key=\'UNK\'):\n """Build the words dictionary and replace rare words with \'UNK\' token.\n The most common word has the smallest integer id.\n\n Parameters\n ----------\n words : list of str or byte\n The context in list format. You may need to do preprocessing on the words, such as lower case, remove marks etc.\n vocabulary_size : int\n The maximum vocabulary size, limiting the vocabulary size. Then the script replaces rare words with \'UNK\' token.\n printable : boolean\n Whether to print the read vocabulary size of the given words.\n unk_key : str\n Represent the unknown words.\n\n Returns\n --------\n data : list of int\n The context in a list of ID.\n count : list of tuple and list\n Pair words and IDs.\n - count[0] is a list : the number of rare words\n - count[1:] are tuples : the number of occurrence of each word\n - e.g. [[\'UNK\', 418391], (b\'the\', 1061396), (b\'of\', 593677), (b\'and\', 416629), (b\'one\', 411764)]\n dictionary : dictionary\n It is `word_to_id` that maps word to ID.\n reverse_dictionary : a dictionary\n It is `id_to_word` that maps ID to word.\n\n Examples\n --------\n >>> words = tl.files.load_matt_mahoney_text8_dataset()\n >>> vocabulary_size = 50000\n >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size)\n\n References\n -----------------\n - `tensorflow/examples/tutorials/word2vec/word2vec_basic.py <https://github.com/tensorflow/tensorflow/blob/r0.7/tensorflow/examples/tutorials/word2vec/word2vec_basic.py>`__\n\n """\n if words is None:\n raise Exception("words : list of str or byte")\n\n count = [[unk_key, -1]]\n count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n dictionary = dict()\n for word, _ in count:\n dictionary[word] = len(dictionary)\n data = list()\n unk_count = 0\n for word in words:\n if word in dictionary:\n index = dictionary[word]\n else:\n index = 0 # 
dictionary[\'UNK\']\n unk_count += 1\n data.append(index)\n count[0][1] = unk_count\n reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n if printable:\n tl.logging.info(\'Real vocabulary size %d\' % len(collections.Counter(words).keys()))\n tl.logging.info(\'Limited vocabulary size {}\'.format(vocabulary_size))\n if len(collections.Counter(words).keys()) < vocabulary_size:\n raise Exception(\n "len(collections.Counter(words).keys()) >= vocabulary_size , the limited vocabulary_size must be less than or equal to the read vocabulary_size"\n )\n return data, count, dictionary, reverse_dictionary',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
sts-dev, evaluated with EmbeddingSimilarityEvaluator:

| Metric | Value |
|---|---|
| pearson_cosine | 0.8806 |
| spearman_cosine | 0.881 |
| pearson_manhattan | 0.8781 |
| spearman_manhattan | 0.8798 |
| pearson_euclidean | 0.8794 |
| spearman_euclidean | 0.881 |
| pearson_dot | 0.8806 |
| spearman_dot | 0.881 |
| pearson_max | 0.8806 |
| spearman_max | 0.881 |
func_name and whole_func_string

| | func_name | whole_func_string |
|---|---|---|
| type | string | string |
| details |
|
|
| func_name | whole_func_string |
|---|---|
| ImageGraphCut.__msgc_step3_discontinuity_localization | def __msgc_step3_discontinuity_localization(self): |
| ImageGraphCut.__multiscale_gc_lo2hi_run | def __multiscale_gc_lo2hi_run(self): # , pyed): |
| ImageGraphCut.__multiscale_gc_hi2lo_run | def __multiscale_gc_hi2lo_run(self): # , pyed): |
MultipleNegativesRankingLoss with these parameters:
{
"scale": 20.0,
"similarity_fct": "cos_sim"
}
func_name and whole_func_string

| | func_name | whole_func_string |
|---|---|---|
| type | string | string |
| details |
|
|
| func_name | whole_func_string |
|---|---|
| learn | def learn(env, |
| ActWrapper.save_act | def save_act(self, path=None): |
| nature_cnn | def nature_cnn(unscaled_images, **conv_kwargs): |
MultipleNegativesRankingLoss with these parameters:
{
"scale": 20.0,
"similarity_fct": "cos_sim"
}
- eval_strategy: steps
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- learning_rate: 2e-05
- num_train_epochs: 1
- warmup_ratio: 0.1
- fp16: True
- batch_sampler: no_duplicates

- overwrite_output_dir: False
- do_predict: False
- eval_strategy: steps
- prediction_loss_only: True
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 1
- eval_accumulation_steps: None
- learning_rate: 2e-05
- weight_decay: 0.0
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 1.0
- num_train_epochs: 1
- max_steps: -1
- lr_scheduler_type: linear
- lr_scheduler_kwargs: {}
- warmup_ratio: 0.1
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_nan_inf_filter: True
- save_safetensors: True
- save_on_each_node: False
- save_only_model: False
- restore_callback_states_from_checkpoint: False
- no_cuda: False
- use_cpu: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- use_ipex: False
- bf16: False
- fp16: True
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: False
- dataloader_num_workers: 0
- dataloader_prefetch_factor: None
- past_index: -1
- disable_tqdm: False
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: False
- ignore_data_skip: False
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- length_column_name: length
- ddp_find_unused_parameters: None
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: False
- dataloader_pin_memory: True
- dataloader_persistent_workers: False
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: False
- resume_from_checkpoint: None
- hub_model_id: None
- hub_strategy: every_save
- hub_private_repo: False
- hub_always_push: False
- gradient_checkpointing: False
- gradient_checkpointing_kwargs: None
- include_inputs_for_metrics: False
- eval_do_concat_batches: True
- fp16_backend: auto
- push_to_hub_model_id: None
- push_to_hub_organization: None
- mp_parameters:
- auto_find_batch_size: False
- full_determinism: False
- torchdynamo: None
- ray_scope: last
- ddp_timeout: 1800
- torch_compile: False
- torch_compile_backend: None
- torch_compile_mode: None
- dispatch_batches: None
- split_batches: None
- include_tokens_per_second: False
- include_num_input_tokens_seen: False
- neftune_noise_alpha: None
- optim_target_modules: None
- batch_eval_metrics: False
- eval_on_start: False
- batch_sampler: no_duplicates
- multi_dataset_batch_sampler: proportional

| Epoch | Step | Training Loss | loss | sts-dev_spearman_cosine |
|---|---|---|---|---|
| 0 | 0 | - | - | 0.8810 |
| 0.08 | 100 | 0.4124 | 0.2191 | - |
| 0.16 | 200 | 0.108 | 0.0993 | - |
| 0.24 | 300 | 0.127 | 0.0756 | - |
| 0.32 | 400 | 0.0728 | - | - |
| 0.08 | 100 | 0.0662 | 0.0683 | - |
| 0.16 | 200 | 0.0321 | 0.0660 | - |
| 0.24 | 300 | 0.0815 | 0.0584 | - |
| 0.32 | 400 | 0.049 | 0.0591 | - |
| 0.4 | 500 | 0.0636 | 0.0612 | - |
| 0.48 | 600 | 0.0929 | 0.0577 | - |
| 0.56 | 700 | 0.0342 | 0.0568 | - |
| 0.64 | 800 | 0.0265 | 0.0572 | - |
| 0.72 | 900 | 0.0406 | 0.0551 | - |
| 0.8 | 1000 | 0.039 | 0.0549 | - |
| 0.88 | 1100 | 0.0376 | 0.0551 | - |
| 0.96 | 1200 | 0.0823 | 0.0556 | - |
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
@misc{henderson2017efficient,
title={Efficient Natural Language Response Suggestion for Smart Reply},
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
year={2017},
eprint={1705.00652},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
Base model
sentence-transformers/all-mpnet-base-v2