Source code for jaconv.jaconv

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unicodedata
from .conv_table import (H2K_TABLE, H2HK_TABLE, K2H_TABLE, H2Z_A, H2Z_AD,
                         H2Z_AK, H2Z_D, H2Z_K, H2Z_DK, H2Z_ALL, Z2H_A, Z2H_AD,
                         Z2H_AK, Z2H_D, Z2H_K, Z2H_DK, Z2H_ALL, KANA2HEP, HEP2KANA)
from .compat import map

# ASCII letters treated as consonants by alphabet2kana: any of these still
# present after its romaji-to-kana substitutions is emitted as a small tsu.
consonants = frozenset('sdfghjklqwrtypzxcvbnm')

def _exclude_ignorechar(ignore, conv_map):
    """Make every character of ``ignore`` translate to itself.

    ``conv_map`` is modified in place: for each character an entry mapping
    its code point to the same code point is written, overriding whatever
    conversion was there. The same object is returned, so callers that must
    not alter a shared table pass in a copy.
    """
    for char in ignore:
        code_point = ord(char)
        conv_map[code_point] = code_point
    return conv_map


def _convert(text, conv_map):
    """Return ``text`` translated through ``conv_map``.

    ``conv_map`` is handed unchanged to ``text.translate``, so it is keyed by
    code point.
    """
    converted = text.translate(conv_map)
    return converted


def hira2kata(text, ignore=''):
    """Convert Hiragana to Full-width (Zenkaku) Katakana.

    Parameters
    ----------
    text : str
        Hiragana string.
    ignore : str
        Characters to be left unconverted.

    Return
    ------
    str
        Katakana string.

    Examples
    --------
    >>> print(jaconv.hira2kata('ともえまみ'))
    トモエマミ
    >>> print(jaconv.hira2kata('まどまぎ', ignore='ど'))
    マどマギ
    """
    if not ignore:
        return _convert(text, H2K_TABLE)
    # Work on a copy so the shared module-level table is never modified.
    table = _exclude_ignorechar(ignore, H2K_TABLE.copy())
    return _convert(text, table)
def hira2hkata(text, ignore=''):
    """Convert Hiragana to Half-width (Hankaku) Katakana.

    Parameters
    ----------
    text : str
        Hiragana string.
    ignore : str
        Characters to be left unconverted.

    Return
    ------
    str
        Half-width Katakana string.

    Examples
    --------
    >>> print(jaconv.hira2hkata('ともえまみ'))
    ﾄﾓｴﾏﾐ
    >>> print(jaconv.hira2hkata('ともえまみ', ignore='み'))
    ﾄﾓｴﾏみ
    """
    if not ignore:
        return _convert(text, H2HK_TABLE)
    # Work on a copy so the shared module-level table is never modified.
    table = _exclude_ignorechar(ignore, H2HK_TABLE.copy())
    return _convert(text, table)
def kata2hira(text, ignore=''):
    """Convert Full-width Katakana to Hiragana.

    Parameters
    ----------
    text : str
        Full-width Katakana string.
    ignore : str
        Characters to be left unconverted.

    Return
    ------
    str
        Hiragana string.

    Examples
    --------
    >>> print(jaconv.kata2hira('巴マミ'))
    巴まみ
    >>> print(jaconv.kata2hira('マミサン', ignore='ン'))
    まみさン
    """
    if not ignore:
        return _convert(text, K2H_TABLE)
    # Work on a copy so the shared module-level table is never modified.
    table = _exclude_ignorechar(ignore, K2H_TABLE.copy())
    return _convert(text, table)
# Half-width base kana, written as escapes so that width-folding tools cannot
# silently turn them into their full-width twins. Each one is paired
# position-by-position with the full-width kana it becomes when followed by a
# half-width voiced sound mark (U+FF9E).
_H2Z_DAKUTEN_BASES = (
    '\uff76\uff77\uff78\uff79\uff7a'   # half-width KA KI KU KE KO
    '\uff7b\uff7c\uff7d\uff7e\uff7f'   # half-width SA SI SU SE SO
    '\uff80\uff81\uff82\uff83\uff84'   # half-width TA TI TU TE TO
    '\uff8a\uff8b\uff8c\uff8d\uff8e'   # half-width HA HI HU HE HO
    '\uff73'                           # half-width U
)
_H2Z_DAKUTEN_KANA = 'ガギグゲゴザジズゼゾダヂヅデドバビブベボヴ'
# Same idea for the half-width semi-voiced sound mark (U+FF9F).
_H2Z_HANDAKUTEN_BASES = '\uff8a\uff8b\uff8c\uff8d\uff8e'
_H2Z_HANDAKUTEN_KANA = 'パピプペポ'

# (two-character half-width sequence, single full-width kana) pairs.
_H2Z_DAKUTEN_PAIRS = tuple(
    [(base + '\uff9e', full)
     for base, full in zip(_H2Z_DAKUTEN_BASES, _H2Z_DAKUTEN_KANA)]
    + [(base + '\uff9f', full)
       for base, full in zip(_H2Z_HANDAKUTEN_BASES, _H2Z_HANDAKUTEN_KANA)]
)


def h2z(text, ignore='', kana=True, ascii=False, digit=False):
    """Convert Half-width (Hankaku) Katakana to Full-width (Zenkaku) Katakana.

    Parameters
    ----------
    text : str
        Half-width Katakana string.
    ignore : str
        Characters to be left unconverted. This only affects the
        per-character table; the merging of kana + sound-mark pairs
        described below runs first and does not consult it.
    kana : bool
        Either converting Kana or not.
    ascii : bool
        Either converting ascii or not.
    digit : bool
        Either converting digit or not.

    Return
    ------
    str
        Full-width Katakana string.

    Notes
    -----
    When ``kana`` is true, a half-width kana followed by a half-width
    (semi-)voiced sound mark is first merged into the single full-width
    voiced kana; the remaining characters then go through the conversion
    table.

    When neither ``ascii`` nor ``digit`` is requested the kana table is used
    even if ``kana`` is false (only the sound-mark merging is skipped). This
    fall-through is kept from the original implementation.

    Examples
    --------
    >>> print(jaconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ'))
    ティロフィナーレ
    >>> print(jaconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ', ignore='ｨ'))
    ティロフｨナーレ
    >>> print(jaconv.h2z('abcd', ascii=True))
    ａｂｃｄ
    >>> print(jaconv.h2z('1234', digit=True))
    １２３４
    """
    # Keyed by (ascii, digit, kana); combinations that are absent fall back
    # to the kana-only table, mirroring the original if/elif ladder.
    tables = {
        (True, True, True): H2Z_ALL,
        (True, True, False): H2Z_AD,
        (True, False, True): H2Z_AK,
        (True, False, False): H2Z_A,
        (False, True, True): H2Z_DK,
        (False, True, False): H2Z_D,
    }
    h2z_map = tables.get((bool(ascii), bool(digit), bool(kana)), H2Z_K)

    if kana:
        # Must run before the per-character table: a voiced kana is two
        # half-width characters but one full-width character.
        for halfwidth_pair, fullwidth in _H2Z_DAKUTEN_PAIRS:
            text = text.replace(halfwidth_pair, fullwidth)

    if ignore:
        # Copy so the shared module-level table is never modified.
        h2z_map = _exclude_ignorechar(ignore, h2z_map.copy())
    return _convert(text, h2z_map)
def z2h(text, ignore='', kana=True, ascii=False, digit=False):
    """Convert Full-width (Zenkaku) Katakana to Half-width (Hankaku) Katakana.

    Parameters
    ----------
    text : str
        Full-width Katakana string.
    ignore : str
        Characters to be left unconverted.
    kana : bool
        Either converting Kana or not.
    ascii : bool
        Either converting ascii or not.
    digit : bool
        Either converting digit or not.

    Return
    ------
    str
        Half-width Katakana string.

    Notes
    -----
    When neither ``ascii`` nor ``digit`` is requested the kana table is used
    even if ``kana`` is false.

    Examples
    --------
    >>> print(jaconv.z2h('ティロフィナーレ'))
    ﾃｨﾛﾌｨﾅｰﾚ
    >>> print(jaconv.z2h('ティロフィナーレ', ignore='ィ'))
    ﾃｨﾛﾌィﾅｰﾚ
    >>> print(jaconv.z2h('ＡＢＣＤ', ascii=True))
    ABCD
    >>> print(jaconv.z2h('１２３４', digit=True))
    1234
    """
    # Keyed by (ascii, digit, kana); combinations that are absent fall back
    # to the kana-only table.
    tables = {
        (True, True, True): Z2H_ALL,
        (True, True, False): Z2H_AD,
        (True, False, True): Z2H_AK,
        (True, False, False): Z2H_A,
        (False, True, True): Z2H_DK,
        (False, True, False): Z2H_D,
    }
    selected = tables.get((bool(ascii), bool(digit), bool(kana)), Z2H_K)
    if ignore:
        # Copy so the shared module-level table is never modified.
        selected = _exclude_ignorechar(ignore, selected.copy())
    return _convert(text, selected)
# Characters rewritten before Unicode normalization. They are spelled as
# escapes because most of them are visually indistinguishable and because the
# full-width forms are easily folded to ASCII by text tooling.
#
# Rewritten to ASCII hyphen-minus:
#   U+2015 horizontal bar, U+2010 hyphen, U+02D7 modifier letter minus,
#   U+058A Armenian hyphen, U+2011 non-breaking hyphen, U+2012 figure dash,
#   U+2013 en dash, U+2043 hyphen bullet, U+207B superscript minus,
#   U+208B subscript minus, U+2212 minus sign.
_NORMALIZE_HYPHENS = (
    '\u2015\u2010\u02d7\u058a\u2011\u2012\u2013\u2043\u207b\u208b\u2212'
)
# Rewritten to the prolonged sound mark (U+30FC):
#   U+301C wave dash, U+FF5E full-width tilde, U+FE63 small hyphen-minus,
#   U+FF0D full-width hyphen-minus, U+2014 em dash,
#   U+2501 / U+2500 box-drawing horizontals.
_NORMALIZE_LONG_MARKS = '\u301c\uff5e\ufe63\uff0d\u2014\u2501\u2500'

_NORMALIZE_TABLE = {
    0x2019: "'",    # right single quotation mark
    0x201d: '"',    # right double quotation mark
    0x201c: '``',   # left double quotation mark
}
_NORMALIZE_TABLE.update((ord(char), '-') for char in _NORMALIZE_HYPHENS)
_NORMALIZE_TABLE.update(
    (ord(char), '\u30fc') for char in _NORMALIZE_LONG_MARKS)


def normalize(text, mode='NFKC', ignore=''):
    """Normalize dashes, tildes and quotes, then apply Unicode normalization.

    With the default ``NFKC`` mode this converts Half-width (Hankaku)
    Katakana to Full-width (Zenkaku) Katakana and Full-width ASCII and DIGIT
    to Half-width ASCII and DIGIT. Before that step, the wave dash, the
    full-width tilde and several long dashes become the prolonged sound mark,
    a family of hyphen and minus look-alikes becomes ``-``, and curly quotes
    become ASCII quotes (the left double quote becomes two backticks).

    Plain ASCII ``-`` and ``~`` are left untouched.

    Parameters
    ----------
    text : str
        Source string.
    mode : str
        Unicode normalization mode, passed to ``unicodedata.normalize``.
    ignore : str
        Accepted for signature compatibility; this function does not
        currently consult it.

    Return
    ------
    str
        Normalized string.

    Examples
    --------
    >>> print(jaconv.normalize('ﾃｨﾛ･フィナ〜レ', 'NFKC'))
    ティロ・フィナーレ
    """
    # No replacement output is itself a key of the table, so one translate
    # pass is equivalent to applying the replacements one after another.
    text = text.translate(_NORMALIZE_TABLE)
    return unicodedata.normalize(mode, text)
# Ordered (hiragana, romaji) replacements. Two-kana digraphs come first so
# they are consumed before their leading kana is replaced on its own.
_KANA_TO_HEPBURN = (
    ('きゃ', 'kya'), ('きゅ', 'kyu'), ('きょ', 'kyo'),
    ('ぎゃ', 'gya'), ('ぎゅ', 'gyu'), ('ぎょ', 'gyo'),
    ('しゃ', 'sha'), ('しゅ', 'shu'), ('しょ', 'sho'),
    ('じゃ', 'ja'), ('じゅ', 'ju'), ('じょ', 'jo'),
    ('ちゃ', 'cha'), ('ちゅ', 'chu'), ('ちょ', 'cho'),
    ('にゃ', 'nya'), ('にゅ', 'nyu'), ('にょ', 'nyo'),
    ('ふぁ', 'fa'), ('ふぃ', 'fi'), ('ふぇ', 'fe'),
    ('ふぉ', 'fo'),
    ('ひゃ', 'hya'), ('ひゅ', 'hyu'), ('ひょ', 'hyo'),
    ('みゃ', 'mya'), ('みゅ', 'myu'), ('みょ', 'myo'),
    ('りゃ', 'rya'), ('りゅ', 'ryu'), ('りょ', 'ryo'),
    ('びゃ', 'bya'), ('びゅ', 'byu'), ('びょ', 'byo'),
    ('ぴゃ', 'pya'), ('ぴゅ', 'pyu'), ('ぴょ', 'pyo'),
    ('が', 'ga'), ('ぎ', 'gi'), ('ぐ', 'gu'),
    ('げ', 'ge'), ('ご', 'go'), ('ざ', 'za'),
    ('じ', 'ji'), ('ず', 'zu'), ('ぜ', 'ze'),
    ('ぞ', 'zo'), ('だ', 'da'), ('ぢ', 'ji'),
    ('づ', 'zu'), ('で', 'de'), ('ど', 'do'),
    ('ば', 'ba'), ('び', 'bi'), ('ぶ', 'bu'),
    ('べ', 'be'), ('ぼ', 'bo'), ('ぱ', 'pa'),
    ('ぴ', 'pi'), ('ぷ', 'pu'), ('ぺ', 'pe'),
    ('ぽ', 'po'),
    ('か', 'ka'), ('き', 'ki'), ('く', 'ku'),
    ('け', 'ke'), ('こ', 'ko'), ('さ', 'sa'),
    ('し', 'shi'), ('す', 'su'), ('せ', 'se'),
    ('そ', 'so'), ('た', 'ta'), ('ち', 'chi'),
    ('つ', 'tsu'), ('て', 'te'), ('と', 'to'),
    ('な', 'na'), ('に', 'ni'), ('ぬ', 'nu'),
    ('ね', 'ne'), ('の', 'no'), ('は', 'ha'),
    ('ひ', 'hi'), ('ふ', 'fu'), ('へ', 'he'),
    ('ほ', 'ho'), ('ま', 'ma'), ('み', 'mi'),
    ('む', 'mu'), ('め', 'me'), ('も', 'mo'),
    ('ら', 'ra'), ('り', 'ri'), ('る', 'ru'),
    ('れ', 're'), ('ろ', 'ro'),
    ('や', 'ya'), ('ゆ', 'yu'), ('よ', 'yo'),
    ('わ', 'wa'), ('ゐ', 'wi'), ('を', 'wo'),
    ('ゑ', 'we'),
)


def kana2alphabet(text):
    """Convert Hiragana to hepburn-style alphabets.

    Parameters
    ----------
    text : str
        Hiragana string.

    Return
    ------
    str
        Hepburn-style alphabets string.

    Notes
    -----
    A small tsu is rendered by repeating the character that follows it; a
    small tsu at the very end of the string becomes ``xtsu``. Runs of several
    small tsu are supported.

    Examples
    --------
    >>> print(jaconv.kana2alphabet('まみさん'))
    mamisan
    """
    for kana, romaji in _KANA_TO_HEPBURN:
        text = text.replace(kana, romaji)
    # Whatever the table above does not cover is left to the per-character
    # table.
    text = _convert(text, KANA2HEP)

    if 'っ' not in text:
        return text

    chars = list(text)
    if chars[-1] == 'っ':
        # Nothing follows to be doubled: spell the small tsu out.
        chars[-1:] = list('xtsu')
    # Resolve right to left so that, in a run of small tsu, each one copies a
    # character that has already been resolved. Resolving left to right
    # would copy the next small tsu and never make progress.
    for pos in range(len(chars) - 2, -1, -1):
        if chars[pos] == 'っ':
            chars[pos] = chars[pos + 1]
    return ''.join(chars)
# Ordered (romaji, hiragana) replacements. Order matters: a longer spelling
# must be consumed before any shorter spelling contained in it.
_HEPBURN_TO_KANA = (
    ('kya', 'きゃ'), ('kyu', 'きゅ'), ('kyo', 'きょ'),
    ('gya', 'ぎゃ'), ('gyu', 'ぎゅ'), ('gyo', 'ぎょ'),
    ('sha', 'しゃ'), ('shu', 'しゅ'), ('sho', 'しょ'),
    ('zya', 'じゃ'), ('zyu', 'じゅ'), ('zyo', 'じょ'),
    ('zyi', 'じぃ'), ('zye', 'じぇ'),
    ('ja', 'じゃ'), ('ju', 'じゅ'), ('jo', 'じょ'),
    ('jya', 'じゃ'), ('jyu', 'じゅ'), ('jyo', 'じょ'),
    ('cha', 'ちゃ'), ('chu', 'ちゅ'), ('cho', 'ちょ'),
    ('tya', 'ちゃ'), ('tyu', 'ちゅ'), ('tyo', 'ちょ'),
    ('nya', 'にゃ'), ('nyu', 'にゅ'), ('nyo', 'にょ'),
    ('hya', 'ひゃ'), ('hyu', 'ひゅ'), ('hyo', 'ひょ'),
    ('mya', 'みゃ'), ('myu', 'みゅ'), ('myo', 'みょ'),
    ('rya', 'りゃ'), ('ryu', 'りゅ'), ('ryo', 'りょ'),
    ('bya', 'びゃ'), ('byu', 'びゅ'), ('byo', 'びょ'),
    ('pya', 'ぴゃ'), ('pyu', 'ぴゅ'), ('pyo', 'ぴょ'),
    # NOTE(review): 'oh' is consumed here, before the h-row syllables, so an
    # 'oh' followed by a vowel is also rewritten. Kept as in the original.
    ('oh', 'おお'),
    ('ga', 'が'), ('gi', 'ぎ'), ('gu', 'ぐ'),
    ('ge', 'げ'), ('go', 'ご'), ('za', 'ざ'),
    ('ji', 'じ'), ('zu', 'ず'), ('ze', 'ぜ'),
    ('zo', 'ぞ'), ('da', 'だ'), ('di', 'ぢ'),
    ('va', 'ゔぁ'), ('vi', 'ゔぃ'), ('vu', 'ゔ'),
    ('ve', 'ゔぇ'), ('vo', 'ゔぉ'), ('vya', 'ゔゃ'),
    ('vyi', 'ゔぃ'), ('vyu', 'ゔゅ'), ('vye', 'ゔぇ'),
    ('vyo', 'ゔょ'),
    ('de', 'で'), ('do', 'ど'),
    ('ba', 'ば'), ('bi', 'び'), ('bu', 'ぶ'),
    ('be', 'べ'), ('bo', 'ぼ'), ('pa', 'ぱ'),
    ('pi', 'ぴ'), ('pu', 'ぷ'), ('pe', 'ぺ'),
    ('po', 'ぽ'), ('dha', 'でゃ'), ('dhi', 'でぃ'),
    ('dhu', 'でゅ'), ('dhe', 'でぇ'), ('dho', 'でょ'),
    ('ka', 'か'), ('ki', 'き'), ('ku', 'く'),
    ('ke', 'け'), ('ko', 'こ'), ('sa', 'さ'),
    # 'tsu' has to be consumed before 'su'; otherwise its 'su' is replaced
    # first and a stray 't' is left behind.
    ('tsu', 'つ'),
    ('shi', 'し'), ('su', 'す'), ('se', 'せ'),
    ('so', 'そ'), ('ta', 'た'), ('chi', 'ち'),
    ('te', 'て'), ('to', 'と'),
    ('na', 'な'), ('ni', 'に'), ('nu', 'ぬ'),
    ('ne', 'ね'), ('no', 'の'), ('ha', 'は'),
    ('hi', 'ひ'), ('fu', 'ふ'), ('he', 'へ'),
    ('ho', 'ほ'), ('ma', 'ま'), ('mi', 'み'),
    ('mu', 'む'), ('me', 'め'), ('mo', 'も'),
    ('ra', 'ら'), ('ri', 'り'), ('ru', 'る'),
    ('re', 'れ'), ('ro', 'ろ'),
    ('ya', 'や'), ('yu', 'ゆ'), ('yo', 'よ'),
    ('wa', 'わ'), ('wi', 'ゐ'), ('we', 'ゑ'),
    ('wo', 'を'),
    ('nn', 'ん'), ('tu', 'つ'), ('hu', 'ふ'),
    ('fa', 'ふぁ'), ('fi', 'ふぃ'), ('fe', 'ふぇ'),
    ('fo', 'ふぉ'), ('-', 'ー'),
)


def alphabet2kana(text):
    """Convert alphabets to Hiragana.

    Parameters
    ----------
    text : str
        Alphabets string.

    Return
    ------
    str
        Hiragana string.

    Notes
    -----
    Romaji syllables are replaced in a fixed order, the remainder goes
    through the per-character table, and any consonant letter that is still
    left afterwards is emitted as a small tsu.

    Examples
    --------
    >>> print(jaconv.alphabet2kana('mamisan'))
    まみさん
    """
    for romaji, kana in _HEPBURN_TO_KANA:
        text = text.replace(romaji, kana)
    text = _convert(text, HEP2KANA)
    # A consonant that survived every substitution above becomes a small tsu.
    return ''.join('っ' if char in consonants else char for char in text)