| import gradio as gr |
| import languagecodes |
| import polars as pl |
|
|
# Load the ISO language table, skip its first row, and keep only the rows
# whose "ISO639-1" column is non-empty.
df = pl.read_parquet("isolanguages.parquet")
non_empty_isos = (
    df.slice(1)
    .filter(pl.col("ISO639-1") != "")
    .rows()
)

# Two lookup tables built from the same rows.
# NOTE(review): presumably each row is (language name, ISO 639-1, ISO 639-2,
# ISO 639-3) -- confirm against the parquet schema.
# all_langs: row[0] -> (row[1], row[2], row[3])
all_langs = {
    row[0]: (row[1], row[2], row[3])
    for row in non_empty_isos
}
# iso1toall: row[1] -> (row[0], row[2], row[3])
iso1toall = {
    row[1]: (row[0], row[2], row[3])
    for row in non_empty_isos
}

# Placeholder; not read anywhere in the visible part of this file.
DEFAULTS = None

# Names of the detection libraries offered in the UI.
libraries = ["stanza", "langdetect", "py3langid", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid", "glotlid"]
|
|
class Detect:
    """Run one piece of text through several language-identification libraries.

    Every detector method returns a two-item list ``[language_code, score]``
    where ``score`` is a number (normally a percentage).  Each third-party
    library is imported inside the method that uses it, so only the detectors
    that are actually called need to be importable.
    """

    # Reference lists kept from the detector bodies; nothing reads them at
    # runtime.  NOTE(review): presumably the language coverage of the stanza
    # and py3langid models -- confirm against the libraries' documentation.
    STANZA_LANGS = (
        'af', 'ar', 'be', 'bg', 'bxr', 'ca', 'cop', 'cs', 'cu', 'da', 'de', 'el',
        'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'fro', 'ga', 'gd', 'gl', 'got',
        'grc', 'he', 'hi', 'hr', 'hsb', 'hu', 'hy', 'id', 'it', 'ja', 'kk', 'kmr',
        'ko', 'la', 'lt', 'lv', 'lzh', 'mr', 'mt', 'nl', 'nn', 'no', 'olo', 'orv',
        'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sme', 'sr', 'sv', 'swl', 'ta', 'te',
        'tr', 'ug', 'uk', 'ur', 'vi', 'wo', 'zh-hans', 'zh-hant',
    )
    PY3LANGID_LANGS = (
        "af", "am", "an", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", "cy",
        "da", "de", "dz", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr", "ga",
        "gl", "gu", "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jv", "ka",
        "kk", "km", "kn", "ko", "ku", "ky", "la", "lb", "lo", "lt", "lv", "mg", "mk", "ml",
        "mn", "mr", "ms", "mt", "nb", "ne", "nl", "nn", "no", "oc", "or", "pa", "pl", "ps",
        "pt", "qu", "ro", "ru", "rw", "se", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta",
        "te", "th", "tl", "tr", "ug", "uk", "ur", "vi", "vo", "wa", "xh", "zh", "zu",
    )

    def __init__(self, text: str) -> None:
        """Store the text that every detector method will analyse."""
        self.text: str = text

    @staticmethod
    def _as_percent(score) -> float:
        """Scale a score by 100 and round it to two decimals as a plain float."""
        return round(float(score) * 100, 2)

    @staticmethod
    def _strip_label(label: str) -> str:
        """Remove the ``__label__`` marker from a fastText prediction label."""
        return label.replace('__label__', '')

    def stanza(self) -> "list[str | float]":
        """Detect the language with stanza's multilingual ``langid`` pipeline."""
        from stanza.models.common.doc import Document
        from stanza.pipeline.core import Pipeline

        docs = Document([], text=self.text)
        nlp = Pipeline(lang="multilingual", processors="langid")
        nlp(docs)
        # No confidence is read from the pipeline, so a fixed 100 is reported.
        return [docs.lang, 100]

    def langdetect(self) -> "list[str | float]":
        """Detect the language with langdetect, seeded for repeatable results."""
        from langdetect import DetectorFactory, detect_langs

        DetectorFactory.seed = 0
        # The first candidate supplies both the code and its probability, so
        # the text is analysed once instead of once per value.
        best = detect_langs(self.text)[0]
        return [best.lang, self._as_percent(best.prob)]

    def langid(self) -> "list[str | float]":
        """Detect the language with langid, built with ``norm_probs=True``."""
        from langid.langid import LanguageIdentifier, model

        identifier = LanguageIdentifier.from_modelstring(string=model, norm_probs=True)
        code, prob = identifier.classify(self.text)
        return [code, abs(self._as_percent(prob))]

    def py3langid(self) -> "list[str | float]":
        """Detect the language with py3langid's module-level classifier."""
        import py3langid

        lang, prob = py3langid.classify(self.text)
        # NOTE(review): the sign is dropped with abs(); the module-level
        # classify() may return an unnormalised score rather than a
        # probability -- confirm against the py3langid documentation.
        return [lang, abs(self._as_percent(prob))]

    def linguapy(self) -> "list[str | float]":
        """Detect the language with lingua, using all preloaded language models."""
        from lingua import LanguageDetectorBuilder

        detector = LanguageDetectorBuilder.from_all_languages().with_preloaded_language_models().build()
        best = detector.compute_language_confidence_values(self.text)[0]
        # Return a number like the other detectors, not a formatted string.
        return [best.language.iso_code_639_1.name.lower(), self._as_percent(best.value)]

    def fasttextlangdetect(self) -> "list[str | float]":
        """Detect the language with the ``ftlangdetect`` package."""
        from ftlangdetect import detect

        result = detect(text=self.text, low_memory=False)
        return [result.get('lang'), abs(self._as_percent(result.get('score')))]

    def fastlangdetect(self) -> "list[str | float]":
        """Detect the language with ``fast_langdetect`` and keep its top result."""
        from fast_langdetect import detect

        result = detect(text=self.text, model="auto", k=1)[0]
        return [result.get('lang'), abs(self._as_percent(result.get('score')))]

    def pycld2(self) -> "list[str | float]":
        """Detect the language with pycld2 in best-effort mode."""
        import pycld2 as cld2

        _, _, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
        # The first detail entry provides the code (index 1) and score (index 2).
        return [details[0][1], round(details[0][2], 2)]

    def parse_fastext(self, repo_id, k=3) -> "list[str | float]":
        """Run a fastText model from the Hugging Face Hub and map its top label.

        Parameters:
            repo_id: Hub repository that provides a ``model.bin`` file.
            k: Number of predictions requested; only the first one is returned.
        Returns:
            ``[code, score]``.  ``code`` comes from the ``all_langs`` table when
            the predicted label can be resolved through
            ``languagecodes.nllb_language_codes``; otherwise the raw label
            (without ``__label__``) is returned instead of raising ``KeyError``.
        """
        import fasttext
        from huggingface_hub import hf_hub_download

        model_path = hf_hub_download(repo_id=repo_id, filename="model.bin")
        model = fasttext.load_model(model_path)
        # fastText predicts one line at a time and rejects embedded newlines.
        single_line = " ".join(self.text.splitlines())
        labels, probabilities = model.predict(single_line, k=k)
        label = self._strip_label(labels[0])
        name_by_code = {code: name for name, code in languagecodes.nllb_language_codes.items()}
        entry = all_langs.get(name_by_code.get(label))
        lang_code = entry[0] if entry else label
        return [lang_code, self._as_percent(probabilities[0])]

    def fasttext(self) -> "list[str | float]":
        """Detect the language with ``facebook/fasttext-language-identification``."""
        return self.parse_fastext("facebook/fasttext-language-identification")

    def openlid(self) -> "list[str | float]":
        """Detect the language with ``laurievb/OpenLID-v2``."""
        return self.parse_fastext("laurievb/OpenLID-v2")

    def glotlid(self) -> "list[str | float]":
        """Detect the language with ``cis-lmu/glotlid``."""
        return self.parse_fastext("cis-lmu/glotlid")
|
|
def detect_language(input_text: str, used_libraries: list[str]) -> list[list]:
    """
    Detects the language of the input text with each selected library.

    Parameters:
        input_text (str): The text whose language should be detected
        used_libraries (list[str]): Names of the detection libraries to run
    Returns:
        list of lists: one [library, language code, score] row per library
        that ran, followed by one summary row with the unique languages, the
        unique codes and the number of distinct codes. When the text or the
        library selection is empty, a single two-column message row is
        returned instead.

    Example (illustrative; codes and scores depend on the library):
        detect_language("Hello world", ["langid"])
        -> [['langid', <code>, <score>],
            ['Unique languages: [...]', 'Unique codes: [...]', 'Languages detected: 1']]
    """
    # Validate first so that nothing is constructed for an empty request.
    if not input_text or not used_libraries:
        return [['No input text or library selected', 'Please provide input text and/or select a detection library']]

    detector = Detect(input_text)
    # Insertion order fixes the row order of the result table.
    runners = {
        'stanza': detector.stanza,
        'langdetect': detector.langdetect,
        'langid': detector.langid,
        'py3langid': detector.py3langid,
        'lingua-py': detector.linguapy,
        'pycld2': detector.pycld2,
        'fastlangdetect': detector.fastlangdetect,
        'fasttext': detector.fasttext,
        'openlid': detector.openlid,
        'glotlid': detector.glotlid,
    }
    detections = [[name, *run()] for name, run in runners.items() if name in used_libraries]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # summary row is the same on every run.
    unique_codes = list(dict.fromkeys(row[1] for row in detections))
    # Some detectors report codes that are not keys of iso1toall; show the
    # code itself rather than failing the whole request with KeyError.
    unique_languages = [iso1toall.get(code, (code,))[0] for code in unique_codes]
    detections.append([f'Unique languages: {unique_languages}', f'Unique codes: {unique_codes}', f'Languages detected: {len(unique_codes)}'])
    print(unique_codes, unique_languages, detections)
    return detections
| |
# Build the UI: a text box, a library selector and a results table; submitting
# the text box calls detect_language.
with gr.Blocks() as interface:
    gr.Markdown("### Language Detection with Gradio API and MCP Server")
    input_text = gr.Textbox(
        label="Enter text to detect:",
        placeholder="Type/copy text here, maximum 512 characters",
        max_length=512,
        autofocus=True,
        submit_btn='Detect Language',
    )
    # NOTE(review): the source's indentation was lost; the Row is assumed to
    # wrap only the checkbox group -- confirm the intended layout.
    with gr.Row(variant="compact"):
        used_libraries = gr.CheckboxGroup(
            choices=libraries,
            value=libraries,  # every library is pre-selected
            label="Detection libraries",
            show_select_all=True,
        )
    dataframe = gr.Dataframe(
        label='Language detection dataframe',
        headers=["Library", "Language code", "Score"],
        datatype=["str", "str", "number"],
        type='array',
        row_count=len(libraries),
        column_count=3,
        column_limits=(2, 4),
    )
    # Submitting the text box runs the detection and fills the table.
    input_text.submit(
        fn=detect_language,
        inputs=[input_text, used_libraries],
        outputs=[dataframe],
    )

# Launch the app, with the MCP server enabled, only when run as a script.
if __name__ == "__main__":
    interface.launch(mcp_server=True, footer_links=["api", "settings"])
| |