From 4e04fbe99749bd83c0be301d7f3b26f0c4e49ecb Mon Sep 17 00:00:00 2001 From: Ryan McGrath Date: Wed, 11 Apr 2012 06:30:19 -0400 Subject: [PATCH] Initial, etc --- LICENSE | 21 +++ jTransliterate/__init__.py | 254 +++++++++++++++++++++++++++++ jTransliterate/test.py | 30 ++++ jTransliterate/translation_maps.py | 128 +++++++++++++++ readme.md | 57 +++++++ setup.py | 34 ++++ 6 files changed, 524 insertions(+) create mode 100644 LICENSE create mode 100644 jTransliterate/__init__.py create mode 100644 jTransliterate/test.py create mode 100644 jTransliterate/translation_maps.py create mode 100644 readme.md create mode 100644 setup.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cd5b253 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2009 - 2010 Ryan McGrath + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
    A class that allows for easy transliteration of [Hirag/Katak]ana
    and English/Latin. Much of the work here is inspired/inherited/etc
    from Kim Ahlström and his work on "Ve", built in Ruby.

    Credit where credit is due:
    https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb
"""

__author__ = "Ryan McGrath "
__version__ = "1.0"

import functools
import re

# Lookup tables for character conversions. Much of this is borrowed from the work of
# Kim Ahlström and Ve: https://github.com/Kimtaro/ve/
#
# Ve's Transliterators are written in Ruby, and I wanted Python. Consider it a nice port. ;)
try:
    from translation_maps import H_SYLLABIC_N, H_SMALL_TSU, HIRA_TO_LATN, LATN_TO_HIRA
except ImportError:
    # Interpreters without implicit relative imports cannot see the sibling
    # module by its bare name when this file is loaded as part of the package.
    from .translation_maps import H_SYLLABIC_N, H_SMALL_TSU, HIRA_TO_LATN, LATN_TO_HIRA

try:
    _TEXT_TYPE = unicode
    _unichr = unichr
except NameError:
    # No separate ``unicode`` type: ``str`` is already unicode text.
    _TEXT_TYPE = str
    _unichr = chr


def _to_unicode(value):
    """Return ``value`` as unicode text, decoding byte strings as UTF-8.

    Unicode input is returned untouched. Calling ``.decode('utf-8')`` on a
    unicode object makes Python 2 encode it to ASCII first, which raises for
    any kana, so every method below funnels its input through this helper.
    """
    if isinstance(value, _TEXT_TYPE):
        return value
    return _TEXT_TYPE(value, 'utf-8')


# The two special kana, normalised once so they compare equal to slices of
# unicode text no matter how translation_maps spells them.
_SMALL_TSU = _to_unicode(H_SMALL_TSU)
_SYLLABIC_N = _to_unicode(H_SYLLABIC_N)

# Code points U+3084..U+3088: a syllabic n directly before one of these is
# written "n'" so that it cannot be read as the start of a nya/nyu/nyo mora.
_Y_KANA_FOLLOWS = re.compile(u'[\u3084-\u3088]')

# Romaji helpers. These were ported from Ruby regex literals: the patterns
# must not contain the surrounding slashes, and they must be raw strings so
# that \1 is a back-reference rather than the control character \x01.
_M_BEFORE_LABIAL = re.compile(r'm([BbPp])')
_CAPITAL_M_BEFORE_LABIAL = re.compile(r'M([BbPp])')
_HAS_UPPERCASE = re.compile(r'[A-Z]')
_DOUBLE_N_BEFORE_VOWEL = re.compile(r'nn[aiueo]')
_DOUBLED_CONSONANT = re.compile(r'([kgsztdnbpmyrlwc])\1')


def defaultToSelfText(fn):
    """
        A fun little decorator that makes it so we can default to
        the text stored on a class instance, but also let people just
        instantiate and re-use calls while supplying new text. Whee.

        The wrapped method is always invoked as fn(self, text = ...).
        functools.wraps keeps the method's own name and docstring visible
        on the wrapper.
    """
    @functools.wraps(fn)
    def wrapper(self, text = None):
        if text is None:
            text = self.text

        return fn(self, text = text)

    return wrapper


class JapaneseTransliterator(object):
    """Transliterates between hiragana, katakana and Latin script, and
    between half-width and full-width characters.

    Every transliterate_* method works on the text given to the constructor
    unless a different text is passed to the call itself.
    """

    def __init__(self, text):
        """__init__(self, text)

            JapaneseTransliterator("fadjfnjsfnjsafnjsdnf")

            The original text is stored on the instance and can be
            overridden on a per-function-call basis.

            Parameters:
                text - Text to be operated on. Unicode, or a UTF-8
                        encoded byte string.
        """
        self.text = text

    @defaultToSelfText
    def transliterate_from_hrkt_to_latn(self, text):
        """transliterate_from_hrkt_to_latn(self, text)

            Transliterates from [Hirag/Katak]ana to Latin/En by first
            folding katakana to hiragana and then romanising the result.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance.

            Returns:
                The romanised text.
        """
        text = self.transliterate_from_kana_to_hira(text)
        return self.transliterate_from_hira_to_latn(text)

    @defaultToSelfText
    def transliterate_from_hira_to_latn(self, text):
        """transliterate_from_hira_to_latn(self, text)

            Transliterates from Hiragana to Latin/En. Phonetics, that is.

            The text is scanned left to right, trying a two-character
            mora before a one-character one. Anything that is not in
            HIRA_TO_LATN is copied to the output unchanged.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance. Unicode or UTF-8 bytes.

            Returns:
                The romanised text.
        """
        kana = _to_unicode(text)
        romaji = []
        geminate = False

        index = 0
        total = len(kana)

        while index < total:
            for length in (2, 1):
                mora = ''
                for_conversion = kana[index:(index + length)]

                if for_conversion == _SMALL_TSU:
                    # Small tsu has no sound of its own; it doubles the
                    # first consonant of whatever mora comes next.
                    geminate = True
                    index += len(for_conversion)
                    break

                if for_conversion == _SYLLABIC_N and _Y_KANA_FOLLOWS.match(kana[(index + 1):(index + 2)]):
                    # Syllabic N before ya, yu or yo
                    mora = "n'"
                elif for_conversion in HIRA_TO_LATN:
                    mora = HIRA_TO_LATN[for_conversion]

                if mora:
                    if geminate:
                        geminate = False
                        # First letter of the romanised mora. Slicing with
                        # the kana index here produced "kite" for "kitte".
                        romaji.append(mora[:1])

                    romaji.append(mora)
                    index += len(for_conversion)
                    break

                if length == 1:
                    # Not kana we know about: pass it through untouched.
                    romaji.append(for_conversion)
                    index += 1

        return ''.join(romaji)

    @defaultToSelfText
    def transliterate_from_latn_to_hrkt(self, text):
        """transliterate_from_latn_to_hrkt(self, text)

            Transliterates from Latin/En to Hiragana (mostly). A chunk
            containing an upper-case ASCII letter is emitted as Katakana
            instead.

            The text is scanned left to right trying three, two and then
            one letter at a time against LATN_TO_HIRA. Letters that cannot
            be converted are copied to the output lower-cased.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance.

            Returns:
                The converted text, in the same string type as the input:
                unicode for unicode input, UTF-8 bytes for byte input.
        """
        wants_bytes = not isinstance(text, _TEXT_TYPE)

        # Work on unicode throughout. Mixing the UTF-8 byte strings stored
        # in LATN_TO_HIRA with the unicode katakana returned by
        # transliterate_from_hira_to_kana raises UnicodeDecodeError.
        romaji = _to_unicode(text)
        kana = []

        # "m" before b/p is the syllabic n (shimbun -> shinbun).
        romaji = _M_BEFORE_LABIAL.sub(r'n\1', romaji)
        romaji = _CAPITAL_M_BEFORE_LABIAL.sub(r'N\1', romaji)

        index = 0
        total = len(romaji)

        # Run until the whole string is consumed. Stopping one character
        # early dropped a trailing single-letter mora ("nihon" lost its n).
        while index < total:
            for length in (3, 2, 1):
                mora = ''
                for_conversion = romaji[index:(index + length)]
                for_removal = len(for_conversion)
                is_upper = _HAS_UPPERCASE.search(for_conversion) is not None
                for_conversion = for_conversion.lower()

                if _DOUBLE_N_BEFORE_VOWEL.match(for_conversion):
                    # "nna": emit the syllabic n and leave "na" for the
                    # next pass (konna -> ko / n / na).
                    mora = _SYLLABIC_N
                    for_removal = 1
                elif for_conversion in LATN_TO_HIRA:
                    mora = _to_unicode(LATN_TO_HIRA[for_conversion])
                elif for_conversion == 'tch' or (length == 2 and _DOUBLED_CONSONANT.match(for_conversion)):
                    # Doubled consonant: emit a small tsu and consume only
                    # the first of the two letters.
                    mora = _SMALL_TSU
                    for_removal = 1

                if mora != '':
                    if is_upper:
                        mora = self.transliterate_from_hira_to_kana(text = mora)

                    kana.append(mora)
                    index += for_removal
                    break

                if length == 1:
                    kana.append(for_conversion)
                    index += 1

        converted = u''.join(kana)
        if wants_bytes:
            return converted.encode('utf-8')
        return converted

    @defaultToSelfText
    def transliterate_from_kana_to_hira(self, text):
        """transliterate_from_kana_to_hira(self, text)

            Transliterates from Katakana to Hiragana by moving code points
            12449-12534 down by 96. Everything else is left alone.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance.

            Returns:
                Unicode text.
        """
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -96, 12449, 12534)

    @defaultToSelfText
    def transliterate_from_hira_to_kana(self, text):
        """transliterate_from_hira_to_kana(self, text)

            Transliterates from Hiragana to Katakana by moving code points
            12353-12438 up by 96. Everything else is left alone.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance.

            Returns:
                Unicode text.
        """
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 96, 12353, 12438)

    @defaultToSelfText
    def transliterate_from_fullwidth_to_halfwidth(self, text):
        """transliterate_from_fullwidth_to_halfwidth(self, text)

            Transliterates from full-width to half-width: code points
            65281-65374 move down by 65248, and code point 12288 becomes
            code point 32.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance.

            Returns:
                Unicode text.
        """
        text = JapaneseTransliterator.transpose_codepoints_in_range(text, -65248, 65281, 65374)
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -12256, 12288, 12288)

    @defaultToSelfText
    def transliterate_from_halfwidth_to_fullwidth(self, text):
        """transliterate_from_halfwidth_to_fullwidth(self, text)

            Transliterates from half-width to full-width: code points
            33-126 move up by 65248, and code point 32 becomes code
            point 12288.

            Parameters:
                text - Optional. Use different text than what's on
                        the class instance.

            Returns:
                Unicode text.
        """
        text = JapaneseTransliterator.transpose_codepoints_in_range(text, 65248, 33, 126)
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 12256, 32, 32)

    @staticmethod
    def transpose_codepoints_in_range(text, distance, range_start, range_end):
        """JapaneseTransliterator.transpose_codepoints_in_range(text, distance, range_start, range_end)

            Given a set of text (unicode...), coupled with distance and range, transposes
            it for a corresponding swap and returns the new set.

            Parameters:
                text - text to be transposed, codepoint-wise. Byte strings are
                        decoded as UTF-8 first.
                distance - to the other side of the map
                range_start - start of the range we're interested in, codepoint-wise
                range_end - end of the range we're interested in, codepoint-wise (inclusive)

            Returns:
                Unicode text in which every character inside the range has been
                shifted by distance and every other character is unchanged.
        """
        if not isinstance(text, _TEXT_TYPE):
            # Raises UnicodeDecodeError when the bytes are not valid UTF-8.
            text = _TEXT_TYPE(text, 'utf-8')

        transposed = []

        for char in text:
            codepoint = ord(char)
            if range_start <= codepoint <= range_end:
                transposed.append(_unichr(codepoint + distance))
            else:
                transposed.append(char)

        return u''.join(transposed)
;| +b.transliterate_from_halfwidth_to_fullwidth() +b.transliterate_from_fullwidth_to_halfwidth() diff --git a/jTransliterate/translation_maps.py b/jTransliterate/translation_maps.py new file mode 100644 index 0000000..7a89963 --- /dev/null +++ b/jTransliterate/translation_maps.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +H_SYLLABIC_N= 'ん' +H_SMALL_TSU = 'っ' + +""" + +Python sucks with regards to unicode-fun, but I'm leaving this here as +a fun reference for anyone deciphering all this. Enjoy. -- Ryan + +HIRA_TO_LATN = { + "あ":"a", "い":"i", "う":"", "え":"e", "お":"o", + "か":"ka", "き":"ki", "く":"k", "け":"ke", "こ":"ko", + "が":"ga", "ぎ":"gi", "ぐ":"g", "げ":"ge", "ご":"go", + "さ":"sa", "し":"shi", "す":"s", "せ":"se", "そ":"so", + "ざ":"za", "じ":"ji", "ず":"z", "ぜ":"ze", "ぞ":"zo", + "た":"ta", "ち":"chi", "つ":"ts", "て":"te", "と":"to", + "だ":"da", "ぢ":"ji", "づ":"z", "で":"de", "ど":"do", + "な":"na", "に":"ni", "ぬ":"n", "ね":"ne", "の":"no", + "は":"ha", "ひ":"hi", "ふ":"f", "へ":"he", "ほ":"ho", + "ば":"ba", "び":"bi", "ぶ":"b", "べ":"be", "ぼ":"bo", + "ぱ":"pa", "ぴ":"pi", "ぷ":"p", "ぺ":"pe", "ぽ":"po", + "ま":"ma", "み":"mi", "む":"m", "め":"me", "も":"mo", + "や":"ya", "ゆ":"y", "よ":"yo", + "ら":"ra", "り":"ri", "る":"r", "れ":"re", "ろ":"ro", + "わ":"wa", "うぃ":"whi", "うぇ":"whe", "を":"wo", + "ゑ":"wye", "ゐ":"wyi", "ー":"-", "ん":"n", + + "きゃ":"kya", "きゅ":"ky", "きょ":"kyo", "きぇ":"kye", "きぃ":"kyi", + "ぎゃ":"gya", "ぎゅ":"gy", "ぎょ":"gyo", "ぎぇ":"gye", "ぎぃ":"gyi", + "くぁ":"kwa", "くぃ":"kwi", "くぅ":"kw", "くぇ":"kwe", "くぉ":"kwo", + "ぐぁ":"qwa", "ぐぃ":"gwi", "ぐぅ":"gw", "ぐぇ":"gwe", "ぐぉ":"gwo", + "しゃ":"sha", "しぃ":"syi", "しゅ":"sh", "しぇ":"she", "しょ":"sho", + "じゃ":"jya", "じゅ":"zy", "じぇ":"zye", "じょ":"zyo", "じぃ":"zyi", + "すぁ":"swa", "すぃ":"swi", "すぅ":"sw", "すぇ":"swe", "すぉ":"swo", + "ちゃ":"tya", "ちゅ":"ty", "ちぇ":"tye", "ちょ":"tyo", "ちぃ":"tyi", + "ぢゃ":"dya", "ぢぃ":"dyi", "ぢゅ":"dy", "ぢぇ":"dye", "ぢょ":"dyo", + "つぁ":"tsa", "つぃ":"tsi", "つぇ":"tse", "つぉ":"tso", "てゃ":"tha", + "てぃ":"thi", "てゅ":"th", "てぇ":"the", "てょ":"tho", "とぁ":"twa", + "とぃ":"twi", 
"とぅ":"tw", "とぇ":"twe", "とぉ":"two", "でゃ":"dha", + "でぃ":"dhi", "でゅ":"dh", "でぇ":"dhe", "でょ":"dho", "どぁ":"dwa", + "どぃ":"dwi", "どぅ":"dw", "どぇ":"dwe", "どぉ":"dwo", "にゃ":"nya", + "にゅ":"ny", "にょ":"nyo", "にぇ":"nye", "にぃ":"nyi", "ひゃ":"hya", + "ひぃ":"hyi", "ひゅ":"hy", "ひぇ":"hye", "ひょ":"hyo", "びゃ":"bya", + "びぃ":"byi", "びゅ":"by", "びぇ":"bye", "びょ":"byo", "ぴゃ":"pya", + "ぴぃ":"pyi", "ぴゅ":"py", "ぴぇ":"pye", "ぴょ":"pyo", "ふぁ":"fwa", + "ふぃ":"fyi", "ふぇ":"fye", "ふぉ":"fwo", "ふぅ":"fw", "ふゃ":"fya", + "ふゅ":"fy", "ふょ":"fyo", "みゃ":"mya", "みぃ":"myi", "みゅ":"my", + "みぇ":"mye", "みょ":"myo", "りゃ":"rya", "りぃ":"ryi", "りゅ":"ry", + "りぇ":"rye", "りょ":"ryo", + "ゔぁ":"va", "ゔぃ":"vyi", "ゔ":"v", "ゔぇ":"vye", "ゔぉ":"vo", + "ゔゃ":"vya", "ゔゅ":"vy", "ゔょ":"vyo", + "うぁ":"wha", "いぇ":"ye", "うぉ":"who", + "ぁ":"xa", "ぃ":"xi", "ぅ":"x", "ぇ":"xe", "ぉ":"xo", + "ゕ":"xka", "ゖ":"xke", "ゎ":"xwa" +} +""" +HIRA_TO_LATN = {u'\u3057\u3047': 'she', u'\u3057\u3043': 'syi', u'\u308b': 'ru', u'\u3093': 'n', u'\u3074\u3047': 'pye', u'\u3074\u3043': 'pyi', u'\u304d\u3085': 'kyu', u'\u304d\u3087': 'kyo', u'\u304d\u3083': 'kya', u'\u3067\u3047': 'dhe', u'\u3050\u3041': 'qwa', u'\u3067\u3043': 'dhi', u'\u3094\u3087': 'vyo', u'\u308a\u3043': 'ryi', u'\u3094\u3083': 'vya', u'\u3048': 'e', u'\u3050': 'gu', u'\u3058': 'ji', u'\u3060': 'da', u'\u3064\u3049': 'tso', u'\u3064\u3047': 'tse', u'\u3064\u3043': 'tsi', u'\u3064\u3041': 'tsa', u'\u3070': 'ba', u'\u3078': 'he', u'\u3080': 'mu', u'\u3088': 'yo', u'\u3043': 'xi', u'\u3090': 'wyi', u'\u3050\u3043': 'gwi', u'\u3072': 'hi', u'\u3050\u3047': 'gwe', u'\u3050\u3045': 'gwu', u'\u3050\u3049': 'gwo', u'\u3057\u3087': 'sho', u'\u3057\u3085': 'shu', u'\u3057\u3083': 'sha', u'\u304b': 'ka', u'\u3053': 'ko', u'\u3074\u3087': 'pyo', u'\u305b': 'se', u'\u3074\u3085': 'pyu', u'\u3074\u3083': 'pya', u'\u304d\u3047': 'kye', u'\u3068\u3041': 'twa', u'\u304d\u3043': 'kyi', u'\u306b': 'ni', u'\u3067\u3087': 'dho', u'\u3067\u3085': 'dhu', u'\u3067\u3083': 'dha', u'\u3094\u3049': 'vo', u'\u3094\u3047': 'vye', u'\u307b': 
'ho', u'\u3094\u3043': 'vyi', u'\u3094\u3041': 'va', u'\u3081': 'me', u'\u3089': 'ra', u'\u3091': 'wye', u'\u3046\u3041': 'wha', u'\u3046\u3043': 'whi', u'\u3046\u3047': 'whe', u'\u3073\u3083': 'bya', u'\u3046\u3049': 'who', u'\u3073\u3087': 'byo', u'\u3073\u3085': 'byu', u'\u3066\u3083': 'tha', u'\u3066\u3085': 'thu', u'\u3066\u3087': 'tho', u'\u3046': 'u', u'\u304e': 'gi', u'\u3056': 'za', u'\u308a\u3047': 'rye', u'\u305e': 'zo', u'\u3094\u3085': 'vyu', u'\u3066': 'te', u'\u306e': 'no', u'\u3076': 'bu', u'\u307e': 'ma', u'\u3059\u3049': 'swo', u'\u3086': 'yu', u'\u3059\u3041': 'swa', u'\u3059\u3043': 'swi', u'\u3059\u3045': 'swu', u'\u3059\u3047': 'swe', u'\u308e': 'xwa', u'\u3096': 'xke', u'\u308a\u3085': 'ryu', u'\u308a\u3087': 'ryo', u'\u308a\u3083': 'rya', u'\u3073': 'bi', u'\u3069\u3049': 'dwo', u'\u3069\u3041': 'dwa', u'\u3069\u3043': 'dwi', u'\u3069\u3045': 'dwu', u'\u3069\u3047': 'dwe', u'\u3041': 'xa', u'\u3049': 'xo', u'\u3051': 'ke', u'\u3073\u3043': 'byi', u'\u3073\u3047': 'bye', u'\u3061': 'chi', u'\u3069': 'do', u'\u3071': 'pa', u'\u3066\u3043': 'thi', u'\u3066\u3047': 'the', u'\u3079': 'be', u'\u308f': 'wa', u'\u3062\u3085': 'dyu', u'\u3062\u3087': 'dyo', u'\u3062\u3083': 'dya', u'\u307f\u3087': 'myo', u'\u307f\u3085': 'myu', u'\u307f\u3083': 'mya', u'\u3044': 'i', u'\u304c': 'ga', u'\u3072\u3085': 'hyu', u'\u3072\u3087': 'hyo', u'\u3054': 'go', u'\u3072\u3083': 'hya', u'\u305c': 'ze', u'\u3064': 'tsu', u'\u304f\u3049': 'kwo', u'\u304f\u3047': 'kwe', u'\u304f\u3045': 'kwu', u'\u304f\u3043': 'kwi', u'\u306c': 'nu', u'\u304f\u3041': 'kwa', u'\u3074': 'pi', u'\u3068': 'to', u'\u307c': 'bo', u'\u3084': 'ya', u'\u308c': 're', u'\u3072\u3047': 'hye', u'\u3094': 'vu', u'\u3072\u3043': 'hyi', u'\u3045': 'xu', u'\u3047': 'xe', u'\u304f': 'ku', u'\u3057': 'shi', u'\u305f': 'ta', u'\u3062\u3047': 'dye', u'\u3067': 'de', u'\u3062\u3043': 'dyi', u'\u306f': 'ha', u'\u3077': 'pu', u'\u307f\u3047': 'mye', u'\u307f\u3043': 'myi', u'\u30fc': '-', u'\u307f': 'mi', 
u'\u306b\u3083': 'nya', u'\u306b\u3087': 'nyo', u'\u306b\u3085': 'nyu', u'\u308d': 'ro', u'\u3059': 'su', u'\u3095': 'xka', u'\u304e\u3043': 'gyi', u'\u304e\u3047': 'gye', u'\u3042': 'a', u'\u3058\u3043': 'zyi', u'\u304a': 'o', u'\u3058\u3047': 'zye', u'\u3052': 'ge', u'\u3075\u3049': 'fwo', u'\u3075\u3045': 'fwu', u'\u3075\u3047': 'fye', u'\u305a': 'zu', u'\u3075\u3041': 'fwa', u'\u3075\u3043': 'fyi', u'\u3061\u3083': 'tya', u'\u3062': 'ji', u'\u3061\u3085': 'tyu', u'\u3061\u3087': 'tyo', u'\u306a': 'na', u'\u3044\u3047': 'ye', u'\u3068\u3049': 'two', u'\u3068\u3043': 'twi', u'\u307a': 'pe', u'\u3068\u3047': 'twe', u'\u3068\u3045': 'twu', u'\u3082': 'mo', u'\u3058\u3083': 'jya', u'\u308a': 'ri', u'\u3058\u3087': 'zyo', u'\u3058\u3085': 'zyu', u'\u3092': 'wo', u'\u3075\u3085': 'fyu', u'\u3075\u3087': 'fyo', u'\u3075\u3083': 'fya', u'\u3061\u3043': 'tyi', u'\u3061\u3047': 'tye', u'\u306b\u3043': 'nyi', u'\u306b\u3047': 'nye', u'\u304d': 'ki', u'\u3055': 'sa', u'\u305d': 'so', u'\u3065': 'zu', u'\u304e\u3083': 'gya', u'\u306d': 'ne', u'\u304e\u3085': 'gyu', u'\u304e\u3087': 'gyo', u'\u3075': 'fu', u'\u307d': 'po'} + +LATN_TO_HIRA = { + 'a': 'あ', 'i': 'い', 'u': 'う', 'e': 'え', 'o': 'お', + 'ka': 'か', 'ki': 'き', 'ku': 'く', 'ke': 'け', 'ko': 'こ', + 'ga': 'が', 'gi': 'ぎ', 'gu': 'ぐ', 'ge': 'げ', 'go': 'ご', + 'sa': 'さ', 'si': 'し', 'shi': 'し', 'su': 'す', 'se': 'せ', 'so': 'そ', + 'za': 'ざ', 'zi': 'じ', 'ji': 'じ', 'zu': 'ず', 'ze': 'ぜ', 'zo': 'ぞ', + 'ta': 'た', 'ti': 'ち', 'chi': 'ち', 'tu': 'つ', 'tsu': 'つ', 'te': 'て','to': 'と', + 'da': 'だ', 'di': 'ぢ', 'du': 'づ', 'dzu': 'づ', 'de': 'で','do': 'ど', + 'na': 'な', 'ni': 'に', 'nu': 'ぬ','ne': 'ね','no': 'の', + 'ha': 'は', 'hi': 'ひ', 'hu': 'ふ', 'fu': 'ふ', 'he': 'へ','ho': 'ほ', + 'ba': 'ば', 'bi': 'び', 'bu': 'ぶ','be': 'べ','bo': 'ぼ', + 'pa': 'ぱ', 'pi': 'ぴ', 'pu': 'ぷ','pe': 'ぺ','po': 'ぽ', + 'ma': 'ま', 'mi': 'み', 'mu': 'む','me': 'め','mo': 'も', + 'ya': 'や', 'yu': 'ゆ', 'yo': 'よ', + 'ra': 'ら', 'ri': 'り', 'ru': 'る','re': 'れ','ro': 'ろ', + 'la': 'ら', 'li': 
'り', 'lu': 'る','le': 'れ','lo': 'ろ', + 'wa': 'わ', 'wi': 'うぃ', 'we': 'うぇ', 'wo': 'を', + 'wye': 'ゑ', 'wyi': 'ゐ', '-': 'ー', + + 'n': 'ん', 'nn': 'ん', "n'": 'ん', + + 'kya': 'きゃ', 'kyu': 'きゅ', 'kyo': 'きょ', 'kye': 'きぇ', 'kyi': 'きぃ', + 'gya': 'ぎゃ', 'gyu': 'ぎゅ', 'gyo': 'ぎょ', 'gye': 'ぎぇ', 'gyi': 'ぎぃ', + 'kwa': 'くぁ', 'kwi': 'くぃ', 'kwu': 'くぅ', 'kwe': 'くぇ', 'kwo': 'くぉ', + 'gwa': 'ぐぁ', 'gwi': 'ぐぃ', 'gwu': 'ぐぅ', 'gwe': 'ぐぇ', 'gwo': 'ぐぉ', + 'qwa': 'ぐぁ', 'gwi': 'ぐぃ', 'gwu': 'ぐぅ', 'gwe': 'ぐぇ', 'gwo': 'ぐぉ', + + 'sya': 'しゃ', 'syi': 'しぃ', 'syu': 'しゅ', 'sye': 'しぇ', 'syo': 'しょ', + 'sha': 'しゃ','shu': 'しゅ', 'she': 'しぇ', 'sho': 'しょ', + 'ja': 'じゃ','ju': 'じゅ', 'je': 'じぇ', 'jo': 'じょ', + 'jya': 'じゃ', 'jyi': 'じぃ', 'jyu': 'じゅ', 'jye': 'じぇ', 'jyo': 'じょ', + 'zya': 'じゃ', 'zyu': 'じゅ', 'zyo': 'じょ', 'zye': 'じぇ', 'zyi': 'じぃ', + 'swa': 'すぁ', 'swi': 'すぃ', 'swu': 'すぅ', 'swe': 'すぇ', 'swo': 'すぉ', + + 'cha': 'ちゃ','chu': 'ちゅ', 'che': 'ちぇ', 'cho': 'ちょ', + 'cya': 'ちゃ', 'cyi': 'ちぃ', 'cyu': 'ちゅ', 'cye': 'ちぇ', 'cyo': 'ちょ', + 'tya': 'ちゃ', 'tyi': 'ちぃ', 'tyu': 'ちゅ', 'tye': 'ちぇ', 'tyo': 'ちょ', + 'dya': 'ぢゃ', 'dyi': 'ぢぃ', 'dyu': 'ぢゅ', 'dye': 'ぢぇ', 'dyo': 'ぢょ', + 'tsa': 'つぁ', 'tsi': 'つぃ','tse': 'つぇ', 'tso': 'つぉ', + 'tha': 'てゃ', 'thi': 'てぃ', 'thu': 'てゅ', 'the': 'てぇ', 'tho': 'てょ', + 'twa': 'とぁ', 'twi': 'とぃ', 'twu': 'とぅ', 'twe': 'とぇ', 'two': 'とぉ', + 'dha': 'でゃ', 'dhi': 'でぃ', 'dhu': 'でゅ', 'dhe': 'でぇ', 'dho': 'でょ', + 'dwa': 'どぁ', 'dwi': 'どぃ', 'dwu': 'どぅ', 'dwe': 'どぇ', 'dwo': 'どぉ', + + 'nya': 'にゃ', 'nyu': 'にゅ', 'nyo': 'にょ', 'nye': 'にぇ', 'nyi': 'にぃ', + + 'hya': 'ひゃ', 'hyi': 'ひぃ', 'hyu': 'ひゅ', 'hye': 'ひぇ', 'hyo': 'ひょ', + 'bya': 'びゃ', 'byi': 'びぃ', 'byu': 'びゅ', 'bye': 'びぇ', 'byo': 'びょ', + 'pya': 'ぴゃ', 'pyi': 'ぴぃ', 'pyu': 'ぴゅ', 'pye': 'ぴぇ', 'pyo': 'ぴょ', + 'fa': 'ふぁ', 'fi': 'ふぃ','fe': 'ふぇ', 'fo': 'ふぉ', + 'fwa': 'ふぁ', 'fwi': 'ふぃ', 'fwu': 'ふぅ', 'fwe': 'ふぇ', 'fwo': 'ふぉ', + 'fya': 'ふゃ', 'fyi': 'ふぃ', 'fyu': 'ふゅ', 'fye': 'ふぇ', 'fyo': 'ふょ', + + 'mya': 'みゃ', 'myi': 'みぃ', 'myu': 'みゅ', 'mye': 'みぇ', 'myo': 'みょ', + + 'rya': 'りゃ', 'ryi': 'りぃ', 
'ryu': 'りゅ', 'rye': 'りぇ', 'ryo': 'りょ', + 'lya': 'りゃ', 'lyu': 'りゅ', 'lyo': 'りょ', 'lye': 'りぇ', 'lyi': 'りぃ', + + 'va': 'ゔぁ', 'vi': 'ゔぃ', 'vu': 'ゔ','ve': 'ゔぇ', 'vo': 'ゔぉ', + 'vya': 'ゔゃ', 'vyi': 'ゔぃ', 'vyu': 'ゔゅ', 'vye': 'ゔぇ', 'vyo': 'ゔょ', + 'wha': 'うぁ', 'whi': 'うぃ', 'ye': 'いぇ', 'whe': 'うぇ', 'who': 'うぉ', + + 'xa': 'ぁ', 'xi': 'ぃ', 'xu': 'ぅ', 'xe': 'ぇ', 'xo': 'ぉ', + 'xya': 'ゃ', 'xyu': 'ゅ', 'xyo': 'ょ', + 'xtu': 'っ', 'xtsu': 'っ', + 'xka': 'ゕ', 'xke': 'ゖ', 'xwa': 'ゎ', + + '@@': ' ', '#[': '「', '#]': '」', '#,': '、', '#.': '。', '#/': '・', +} diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..3e78247 --- /dev/null +++ b/readme.md @@ -0,0 +1,57 @@ +jTransliterate - [Hirag/Katak]ana to Latin/English & Back +=========================================================================== +Sometimes you may want to convert from Hiragana to Katakana, or back again, or... +I dunno, maybe you wanna get the English pronunciation of these words. I'll +be honest and say it's of no concern or interest to me, but I needed this in +Python and so I ported it, figured I'd release it. + +It's MIT licensed. Credit for much of this also belongs to Kim Ahlström and +his linguistics/etc work on **[Ve](https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb)**. 
+ + +Installation +--------------------------------------------------------------------------- + pip install jTransliterate + + +Examples && Documentation +--------------------------------------------------------------------------- +``` python +# -*- coding: utf-8 -*- + +from jTransliterate import JapaneseTransliterator + +# Transliterate from Latin/English to [Hirag/Katak]ana +x = JapaneseTransliterator('kanazawa') +print x.transliterate_from_latn_to_hrkt() +# Should print "かなざわ" + +# Transliterate from Hiragana to Latin/English +b = JapaneseTransliterator('かなざわ') +print b.transliterate_from_hira_to_latn() +# Should print "kanazawa" + +# Transliterate from either Hiragana or Katakana to Latin/English +print b.transliterate_from_hrkt_to_latn(text = 'カナザワ') +# Should print "kanazawa" + +# Transliterate from Katakana to Hiragana (You... probably never need to do this) +print b.transliterate_from_kana_to_hira(text = 'キットカート') +# Should print "きっとかーと" + +# Transliterate from Hiragana to Katakana +print b.transliterate_from_hira_to_kana(text = 'かなざわ') +# Should print "カナザワ" + +# If you want to convert between half/full width kana, you can use the following +# functions. I didn't care enough to do demos here. ;| +b.transliterate_from_halfwidth_to_fullwidth() +b.transliterate_from_fullwidth_to_halfwidth() +``` + +Questions, Comments, Complaints and/or etc +--------------------------------------------------------------------------- +Hit me up on them Twitters or find me on them internets at the links below. + +Twitter: **[@ryanmcgrath](http://twitter.com/ryanmcgrath/)** +Web: **[Veno Designs](http://venodesigns.net/)** diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..41cd274 --- /dev/null +++ b/setup.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +from setuptools import setup +from setuptools import find_packages + +__author__ = 'Ryan McGrath ' +__version__ = '1.0.0' + +setup( + # Basic package information. 
+ name='jTransliterate', + version=__version__, + packages=find_packages(), + + # Packaging options. + include_package_data=True, + + # Metadata for PyPI. + author='Ryan McGrath', + author_email='ryan@venodesigns.net', + license='MIT License', + url='http://github.com/ryanmcgrath/twython/tree/master', + keywords='japanese translation transliterate katakana hiragana latin', + description='Transliterate [Hirag/Katak]ana to Latin/English and back. Convert half/full-width Japanese text.', + long_description=open('readme.md').read(), + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Communications :: Chat', + 'Topic :: Internet' + ] +)