jreadability
jReadability is a Python package for scoring the readability of Japanese text.

This package is an unofficial Python implementation of the readability model developed by Jae-ho Lee and Yoichiro Hasebe in "Readability measurement of Japanese texts based on levelled corpora."
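For quick reference, a minimal usage sketch (the sample sentence is only illustrative):

```python
from jreadability import compute_readability

# score a single Japanese text
text = "日本語は面白いと思います。"
score = compute_readability(text)
print(score)
```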
1""" 2jreadability 3------------ 4 5jReadability is a python package that can be used to score the readability of a given Japanese text. 6 7This package is an unofficial python implementation of the readability model developed by Jae-ho Lee and Yoichiro Hasebe in "Readability measurement of Japanese texts based on levelled corpora." 8""" 9 10from .jreadability import compute_readability 11 12__all__ = ["compute_readability"]
def compute_readability(text: str, tagger: Optional[fugashi.fugashi.Tagger] = None) -> float:
```python
from typing import List, Optional

from fugashi import Tagger
from fugashi.fugashi import UnidicNode


def compute_readability(text: str, tagger: Optional[Tagger] = None) -> float:
    """
    Computes the readability of a Japanese text.

    Args:
        text (str): The text to be scored.
        tagger (Optional[Tagger]): The fugashi parser used to parse the text.

    Returns:
        float: A float representing the readability score of the text.
    """

    if tagger is None:
        # initialize mecab parser
        tagger = Tagger()

    doc = tagger(text)

    def split_japanese_sentences(doc: List[UnidicNode]) -> List[List[UnidicNode]]:
        """
        Helper function that breaks the parsed text into lists of sentences.
        """

        sentences = []
        current_sentence = []
        for token in doc:
            current_sentence.append(token)

            if token.surface in ("。", "?", "!", "."):
                sentences.append(current_sentence)
                current_sentence = []

        # if there's any leftover sentence that doesn't end with sentence-ending punctuation
        if current_sentence:
            sentences.append(current_sentence)

        return sentences

    # first, compute mean sentence length (in words, not characters)
    sentences = split_japanese_sentences(doc)

    sentence_lengths = []
    for sentence_doc in sentences:
        words_per_sentence = len(sentence_doc)
        sentence_lengths.append(words_per_sentence)

    mean_length_of_sentence = sum(sentence_lengths) / len(sentences)

    # next, compute percentage of kango, wago, verbs and particles
    num_kango = 0
    num_wago = 0
    num_verbs = 0
    num_particles = 0
    for token in doc:
        goshu = token.feature.goshu  # goshu (語種) is the word's origin
        pos1 = token.feature.pos1
        pos2 = token.feature.pos2

        if goshu == "漢":  # 'kan', meaning chinese
            num_kango += 1
        elif goshu == "和":  # 'wa', meaning japanese
            num_wago += 1

        if (
            pos1 == "動詞" and pos2 != "非自立可能"
        ):  # 'doushi', meaning verb; but not certain verbs like あり in あります
            num_verbs += 1
        elif pos1 == "助詞":  # 'joshi', meaning particles
            num_particles += 1

    percentage_of_kango = 100.0 * num_kango / len(doc)
    percentage_of_wago = 100.0 * num_wago / len(doc)
    percentage_of_verbs = 100.0 * num_verbs / len(doc)
    percentage_of_particles = 100.0 * num_particles / len(doc)

    readability_score = (
        mean_length_of_sentence * -0.056
        + percentage_of_kango * -0.126
        + percentage_of_wago * -0.042
        + percentage_of_verbs * -0.145
        + percentage_of_particles * -0.044
        + 11.724
    )

    return readability_score
```
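Because the function accepts an optional pre-built parser, a caller scoring many texts can construct one fugashi Tagger and reuse it across calls; a sketch under that assumption (the sample texts are illustrative):

```python
from fugashi import Tagger

from jreadability import compute_readability

# build the MeCab-backed tagger once and share it across calls,
# avoiding per-call dictionary initialization
tagger = Tagger()

texts = [
    "これは短い文です。",
    "難解な語彙を多用した文章は、読解により多くの時間を要する。",
]
scores = [compute_readability(text, tagger) for text in texts]
```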
Computes the readability of a Japanese text.

Args:
    text (str): The text to be scored.
    tagger (Optional[Tagger]): The fugashi parser used to parse the text.

Returns:
    float: A float representing the readability score of the text.
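The returned value is the paper's linear model as implemented above:

score = 11.724 − 0.056 × (mean sentence length in words) − 0.126 × (% kango) − 0.042 × (% wago) − 0.145 × (% verbs) − 0.044 × (% particles)

As a hypothetical worked example (the feature values are invented for illustration): a text with a mean sentence length of 10 words, 10% kango, 40% wago, 10% verbs, and 30% particles scores 11.724 − 0.56 − 1.26 − 1.68 − 1.45 − 1.32 = 5.454. All coefficients are negative, so longer sentences and higher proportions of these word classes lower the score; on the paper's scale, higher scores correspond to easier texts.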