#!/usr/bin/env python
# coding: utf-8

# # About
# 
# The goal of the _Indic NLP Library_ is to build Python-based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc., and this library is an attempt to provide a general solution to the most commonly required toolsets for Indian language text.
# 
# The library provides the following functionalities:
# 
# - Text Normalization
# - Tokenization
# - Sentence Splitter
# - Script Conversion
# - Romanization
# - Indicization
# - Script Information
# - Phonetic Similarity
# - Syllabification
# - Word Segmentation
# - Transliteration
# - Translation
# 
# The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download them from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.

# # Pre-requisites
# 
# - Python 3.5+
# - [Morfessor 2.0 Python Library](http://www.cis.hut.fi/projects/morpho/morfessor2.shtml)

# # Getting Started

# **----- Set these variables -----**

# In[1]:

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"C:\Users\ankunchu\Documents\src\indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"C:\Users\ankunchu\Documents\src\indic_nlp_resources"

# **Add the library to the Python path**

# In[2]:

import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

# **Export the environment variable**
# 
#     export INDIC_RESOURCES_PATH=
# 
# OR
# 
# **set it programmatically.** We will use this method for the demo.

# In[4]:

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

# **Initialize the Indic NLP library**

# In[5]:

from indicnlp import loader
loader.load()

# **Let's actually try out some of the API methods in the Indic NLP library.**
# 
# Many of the API functions require a language code. We use 2-letter ISO 639-1 codes. Some languages do not have assigned 2-letter codes. We use the following two-letter codes for such languages:
# 
# - Konkani: kK
# - Manipuri: mP
# - Bodo: bD

# # Examples for various API features

# ## Text Normalization
# 
# Text written in Indic scripts displays a lot of quirky behaviour on account of varying input methods, multiple representations for the same character, etc. There is a need to canonicalize the representation of text so that NLP applications can handle the data in a consistent manner. The canonicalization primarily handles the following issues:
# 
# - Non-spacing characters like ZWJ/ZWNJ
# - Multiple representations of Nukta-based characters
# - Multiple representations of two-part dependent vowel signs
# - Typing inconsistencies: e.g. use of pipe (|) for poorna virama
# 
# When the available data is scarce, such normalization helps utilize the data more efficiently.
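# To make the first issue above concrete, here is a tiny plain-Python sketch (no library calls) showing how a zero-width joiner hides inside two visually similar strings:

# In[ ]:

# Two visually similar strings: one contains a Zero Width Joiner (U+200D)
with_zwj='\u0915\u094d\u200d\u0937'     # KA + VIRAMA + ZWJ + SSA
without_zwj='\u0915\u094d\u0937'        # KA + VIRAMA + SSA

for s in (with_zwj, without_zwj):
    print(' '.join(hex(ord(c)) for c in s), '-> length', len(s))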
# In[13]:

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

input_text="\u0958 \u0915\u093c"
remove_nuktas=False
factory=IndicNormalizerFactory()
normalizer=factory.get_normalizer("hi",remove_nuktas)
output_text=normalizer.normalize(input_text)

print(input_text)
print()

print('Before normalization')
print(' '.join([ hex(ord(c)) for c in input_text ] ))
print('Length: {}'.format(len(input_text)))
print()
print('After normalization')
print(' '.join([ hex(ord(c)) for c in output_text ] ))
print('Length: {}'.format(len(output_text)))

# ## Sentence Splitter
# 
# A smart sentence splitter which uses a two-pass, rule-based system to split the text into sentences. It is aware of common non-breaking prefixes (such as abbreviations) in Indian languages.

# In[15]:

from indicnlp.tokenize import sentence_tokenize

indic_string="""तो क्या विश्व कप 2019 में मैच का बॉस टॉस है? यानी मैच में हार-जीत में \
टॉस की भूमिका अहम है? आप ऐसा सोच सकते हैं। विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों \
पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।"""
sentences=sentence_tokenize.sentence_split(indic_string, lang='hi')
for t in sentences:
    print(t)

# ## Tokenization
# 
# A trivial tokenizer which just tokenizes on punctuation boundaries. This also covers the punctuation marks of the Indian language scripts (the poorna virama and the deergha virama). It returns a list of tokens.

# In[20]:

from indicnlp.tokenize import indic_tokenize

indic_string='सुनो, कुछ आवाज़ आ रही है। फोन?'

print('Input String: {}'.format(indic_string))
print('Tokens: ')
for t in indic_tokenize.trivial_tokenize(indic_string):
    print(t)

# ## De-tokenization
# 
# A de-tokenizer for Indian languages that handles punctuation in Indic scripts. The de-tokenizer is useful when generating natural language output; it can be used as a post-processor.

# In[16]:

from indicnlp.tokenize import indic_detokenize

indic_string='" सुनो , कुछ आवाज़ आ रही है . " , उसने कहा । '

print('Input String: {}'.format(indic_string))
print('Detokenized String: {}'.format(indic_detokenize.trivial_detokenize(indic_string,lang='hi')))

# ## Script Conversion
# 
# Convert from one Indic script to another. This is a simple mapping which exploits the fact that the Unicode codepoints of the various Indic scripts lie at corresponding offsets from the base codepoint of each script. The following scripts are supported:
# 
# _Devanagari (Hindi, Marathi, Sanskrit, Konkani, Sindhi, Nepali), Assamese, Bengali, Oriya, Gujarati, Gurumukhi (Punjabi), Sindhi, Tamil, Telugu, Kannada, Malayalam_

# In[8]:

from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

input_text='राजस्थान'
# input_text='രാജസ്ഥാന'
# input_text='රාජස්ථාන'

print(UnicodeIndicTransliterator.transliterate(input_text,"hi","ta"))

# ## Romanization
# 
# Convert script text to Roman text in the ITRANS notation.

# In[13]:

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

input_text='राजस्थान'
# input_text='ஆசிரியர்கள்'
lang='hi'

print(ItransTransliterator.to_itrans(input_text,lang))

# ## Indicization (ITRANS to Indic Script)
# 
# Let's call the conversion of an ITRANS transliteration to an Indic script **Indicization**!
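# Romanization and Indicization are inverse operations, so a quick round-trip sketch is a handy sanity check before the library's own example below (note: an ITRANS round trip is not guaranteed to be lossless for every character):

# In[ ]:

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

orig='राजस्थान'
roman=ItransTransliterator.to_itrans(orig,'hi')    # Devanagari -> ITRANS
back=ItransTransliterator.from_itrans(roman,'hi')  # ITRANS -> Devanagari
print('{} -> {} -> {}'.format(orig,roman,back))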
# In[14]:

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

input_text='pAlakkAda'
# input_text='pitL^In'
lang='ml'

x=ItransTransliterator.from_itrans(input_text,lang)
print(x)
for y in x:
    print('{:x}'.format(ord(y)))

# ## Script Information
# 
# Indic scripts have been designed keeping phonetic principles in mind, and the design and organization of the scripts make it easy to obtain phonetic information about the characters.

# ### Get Phonetic Feature Vector
# 
# Each script character is associated with a phonetic feature vector, which encodes the phonetic properties of the character. This is a bit vector which can be obtained as shown below:

# In[28]:

from indicnlp.script import indic_scripts as isc

c='क'
lang='hi'

isc.get_phonetic_feature_vector(c,lang)

# The fields in this bit vector are (from left to right):

# In[30]:

sorted(isc.PV_PROP_RANGES.items(),key=lambda x:x[1][0])

# You can check the phonetic information database files in Indic NLP Resources for the definition of each of the bits.
# 
# - _For the Tamil script_: [database](https://github.com/anoopkunchukuttan/indic_nlp_resources/blob/master/script/tamil_script_phonetic_data.csv)
# - _For other Indic scripts_: [database](https://github.com/anoopkunchukuttan/indic_nlp_resources/blob/master/script/all_script_phonetic_data.csv)

# ### Query Phonetic Properties
# 
# **Note:** _The interface below will be deprecated soon and replaced by a new one._

# In[31]:

from indicnlp.langinfo import *

c='क'
lang='hi'

print('Is vowel?: {}'.format(is_vowel(c,lang)))
print('Is consonant?: {}'.format(is_consonant(c,lang)))
print('Is velar?: {}'.format(is_velar(c,lang)))
print('Is palatal?: {}'.format(is_palatal(c,lang)))
print('Is aspirated?: {}'.format(is_aspirated(c,lang)))
print('Is unvoiced?: {}'.format(is_unvoiced(c,lang)))
print('Is nasal?: {}'.format(is_nasal(c,lang)))

# ### Get Phonetic Similarity
# 
# Using the phonetic feature vectors, we can define phonetic similarity between characters (and the underlying phonemes). The library implements several such similarity measures. All of them are defined in terms of the phonetic feature vectors discussed earlier, so users can also implement additional measures.
# 
# The implemented similarity measures are:
# 
# - cosine
# - dice
# - jaccard
# - dot_product
# - sim1 (Kunchukuttan _et al._, 2016)
# - softmax
# 
# **References**
# 
# Anoop Kunchukuttan, Pushpak Bhattacharyya, Mitesh Khapra. _Substring-based unsupervised transliteration with phonetic and contextual knowledge_. SIGNLL Conference on Computational Natural Language Learning **(CoNLL 2016)**. 2016.

# In[33]:

from indicnlp.script import indic_scripts as isc
from indicnlp.script import phonetic_sim as psim

c1='क'
c2='ख'
c3='भ'
lang='hi'

print('Similarity between {} and {}'.format(c1,c2))
print(psim.cosine(
    isc.get_phonetic_feature_vector(c1,lang),
    isc.get_phonetic_feature_vector(c2,lang)
))

print()

print(u'Similarity between {} and {}'.format(c1,c3))
print(psim.cosine(
    isc.get_phonetic_feature_vector(c1,lang),
    isc.get_phonetic_feature_vector(c3,lang)
))

# _You may have figured out that you can also compute similarities of characters belonging to different scripts._
# 
# You can also get a similarity matrix which contains the similarities between all pairs of characters (within the same script or across scripts).
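# Since the measures are just functions over the feature vectors, you can define your own. Below is a minimal sketch of a simple agreement-based measure (the name `hamming_sim` is our own; it assumes, as the cosine example above suggests, that `get_phonetic_feature_vector` returns a 1-D numpy bit vector):

# In[ ]:

import numpy as np
from indicnlp.script import indic_scripts as isc

def hamming_sim(v1,v2):
    # Fraction of feature-vector positions on which the two characters agree.
    return float(np.sum(v1==v2))/len(v1)

print(hamming_sim(
    isc.get_phonetic_feature_vector('क','hi'),
    isc.get_phonetic_feature_vector('ख','hi')
))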
# Let's see how we can compare characters across the Devanagari and Malayalam scripts.

# In[35]:

from indicnlp.script import indic_scripts as isc
from indicnlp.script import phonetic_sim as psim

slang='hi'
tlang='ml'

sim_mat=psim.create_similarity_matrix(psim.cosine,slang,tlang,normalize=False)

c1='क'
c2='ഖ'

print('Similarity between {} and {}'.format(c1,c2))
print(sim_mat[isc.get_offset(c1,slang),isc.get_offset(c2,tlang)])

# Some similarity functions like `sim1` do not generate values in the range [0,1], and it may be more convenient to have the similarity values in that range. This can be achieved by setting the `normalize` parameter to `True`.

# In[37]:

slang='hi'
tlang='ml'

sim_mat=psim.create_similarity_matrix(psim.sim1,slang,tlang,normalize=True)

c1='क'
c2='ഖ'

print(u'Similarity between {} and {}'.format(c1,c2))
print(sim_mat[isc.get_offset(c1,slang),isc.get_offset(c2,tlang)])

# ### Lexical Similarity

# In[10]:

from indicnlp.script import indic_scripts as isc
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

lang1_str='पिछले दिनों हम लोगों ने कई उत्सव मनाये. कल, हिन्दुस्तान भर में श्री कृष्ण जन्म-महोत्सव मनाया गया.'
lang2_str='વીતેલા દિવસોમાં આપણે કેટલાય ઉત્સવો ઉજવ્યા. હજી ગઇકાલે જ પૂરા હિંદુસ્તાનમાં શ્રીકૃષ્ણ જન્મોત્સવ ઉજવવામાં આવ્યો.'

lang1='hi'
lang2='gu'

lcsr, len1, len2 = isc.lcsr_indic(lang1_str,lang2_str,lang1,lang2)

print('{} string: {}'.format(lang1, lang1_str))
print('{} string: {}'.format(lang2, UnicodeIndicTransliterator.transliterate(lang2_str,lang2,lang1)))
print('Both strings are shown in the Devanagari script, using script conversion for readability.')
print('LCSR: {}'.format(lcsr))

# ## Orthographic Syllabification
# 
# _Orthographic syllabification_ is an approximate syllabification process for Indic scripts, where CV+ units are defined to be _orthographic syllables_.
# 
# See the following paper for details:
# 
# Anoop Kunchukuttan, Pushpak Bhattacharyya. [_Orthographic Syllable as basic unit for SMT between Related Languages_](https://arxiv.org/abs/1610.00634). Conference on Empirical Methods in Natural Language Processing **(EMNLP 2016)**. 2016.

# In[39]:

from indicnlp.syllable import syllabifier

w='जगदीशचंद्र'
lang='hi'

print(' '.join(syllabifier.orthographic_syllabify(w,lang)))

# ## Word Segmentation
# 
# Unsupervised morphological analyzers for various Indian languages. Given a word, the analyzer returns its component morphemes. The analyzer can recognize inflectional and derivational morphemes.
# 
# The following languages are supported:
# 
# _Hindi, Punjabi, Marathi, Konkani, Gujarati, Bengali, Kannada, Tamil, Telugu, Malayalam_
# 
# Support for more languages will be added soon.

# In[6]:

from indicnlp.morph import unsupervised_morph
from indicnlp import common

analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer('mr')

# In[7]:

indic_string='आपल्या हिरड्यांच्या आणि दातांच्यामध्ये जीवाणू असतात .'

analyzed_tokens=analyzer.morph_analyze_document(indic_string.split(' '))

for w in analyzed_tokens:
    print(w)

# ## Transliteration
# 
# We use the [_BrahmiNet_](http://www.cfilt.iitb.ac.in/brahminet/static/rest.html) REST API for transliteration.

# In[11]:

import json
import requests
from urllib.parse import quote

text=quote('manish joe')
# text=quote('मनिश् जोए')

url='http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/transliterate_bulk/en/hi/{}/statistical'.format(text)
print(url)
response = requests.get(url)
response.json()

# You can also use _BrahmiNet_ through [this](http://www.cfilt.iitb.ac.in/brahminet) web interface.
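# Since transliteration goes through a remote service, requests can fail or hang. Here is a small convenience wrapper; the function `transliterate_en2indic`, the timeout, and the error handling are our own additions, not part of the library:

# In[ ]:

import requests
from urllib.parse import quote

def transliterate_en2indic(text, tgt_lang='hi', timeout=10):
    # Wraps the BrahmiNet REST endpoint used above; returns the raw JSON reply.
    url=('http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/'
         'transliterate_bulk/en/{}/{}/statistical').format(tgt_lang, quote(text))
    response=requests.get(url, timeout=timeout)
    response.raise_for_status()   # fail loudly on HTTP errors
    return response.json()

transliterate_en2indic('manish joe')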
# You can read more about _BrahmiNet_ [here](http://www.cfilt.iitb.ac.in/brahminet/static/publications/brahminet_naacl2015.pdf).

# ### Acronyms
# 
# Acronyms behave differently from ordinary words during transliteration. Hence, a rule-based transliterator for transliterating English acronyms to Indian languages is available.
# 
# This can also be used to generate synthetic transliteration data to train an Indian-language-to-English transliterator for acronyms.

# In[18]:

from indicnlp.transliterate import acronym_transliterator

ack_transliterator=acronym_transliterator.LatinToIndicAcronymTransliterator()
ack_transliterator.transliterate('ICICI',lang='hi')

# ## Machine Translation
# 
# We use [_Shata-anuvaadak_](http://www.cfilt.iitb.ac.in/indic-translator) for translation.
# 
# You can read more about _Shata-anuvaadak_ [here](http://www.lrec-conf.org/proceedings/lrec2014/pdf/414_Paper.pdf).

# In[31]:

import json
import requests
from urllib.parse import quote

text=quote('Mumbai is the capital of Maharashtra')
# text=quote('मनिश् जोए')

url='http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/translate/en/mr/{}/'.format(text)
## Note the forward slash '/' at the end of the URL. It should be there, but please live with it for now!
print(url)
response = requests.get(url)
response.json()
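# As a closing example, a minimal sketch chaining several of the components demonstrated above: normalize the text, split it into sentences, then tokenize each sentence. The APIs are exactly those used earlier; we assume `get_normalizer` works with its default arguments.

# In[ ]:

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import sentence_tokenize, indic_tokenize

text='सुनो, कुछ आवाज़ आ रही है। फोन?'
normalizer=IndicNormalizerFactory().get_normalizer('hi')  # default arguments assumed
for sentence in sentence_tokenize.sentence_split(normalizer.normalize(text), lang='hi'):
    print(indic_tokenize.trivial_tokenize(sentence))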