jreadability

jReadability is a Python package that can be used to score the readability of a given Japanese text.

This package is an unofficial python implementation of the readability model developed by Jae-ho Lee and Yoichiro Hasebe in "Readability measurement of Japanese texts based on levelled corpora."

 1"""
 2jreadability
 3------------
 4
 5jReadability is a python package that can be used to score the readability of a given Japanese text.
 6
 7This package is an unofficial python implementation of the readability model developed by Jae-ho Lee and Yoichiro Hasebe in "Readability measurement of Japanese texts based on levelled corpora."
 8"""
 9
10from .jreadability import compute_readability
11
12__all__ = ["compute_readability"]
def compute_readability(text: str, tagger: Optional[Tagger] = None) -> float:
    """
    Compute the readability score of a Japanese text.

    Implements the linear readability model of Lee & Hasebe
    ("Readability measurement of Japanese texts based on levelled
    corpora"): a weighted combination of mean sentence length and the
    percentages of kango, wago, verbs, and particles among the parsed
    tokens.

    Args:
        text (str): The text to be scored. Must yield at least one token
            when parsed.
        tagger (Optional[Tagger]): The fugashi parser used to parse the
            text. A fresh ``Tagger`` is created when omitted.

    Returns:
        float: The readability score of the text (higher means easier).

    Raises:
        ValueError: If the parsed text contains no tokens (e.g. an empty
            or whitespace-only string), which would otherwise cause a
            division by zero.
    """

    if tagger is None:
        # Initialize a MeCab-based parser only when the caller did not
        # supply one (constructing a Tagger is relatively expensive).
        tagger = Tagger()

    doc = tagger(text)
    total_tokens = len(doc)
    if total_tokens == 0:
        # Guard the divisions below against empty input.
        raise ValueError("text produced no tokens; cannot compute readability")

    def split_japanese_sentences(doc: List[UnidicNode]) -> List[List[UnidicNode]]:
        """
        Helper function that breaks the parsed text into lists of sentences.
        """
        sentences = []
        current_sentence = []
        for token in doc:
            current_sentence.append(token)

            # Sentence-ending punctuation closes the current sentence.
            if token.surface in ("。", "?", "!", "."):
                sentences.append(current_sentence)
                current_sentence = []

        # Keep any leftover sentence that doesn't end with
        # sentence-ending punctuation.
        if current_sentence:
            sentences.append(current_sentence)

        return sentences

    # First, compute mean sentence length (in words, not characters).
    # Every token belongs to exactly one sentence, so the total token
    # count divided by the sentence count is the mean sentence length.
    sentences = split_japanese_sentences(doc)
    mean_length_of_sentence = total_tokens / len(sentences)

    # Next, count kango, wago, verbs and particles in a single pass.
    num_kango = 0
    num_wago = 0
    num_verbs = 0
    num_particles = 0
    for token in doc:
        goshu = token.feature.goshu  # goshu (語種) is the word's origin
        pos1 = token.feature.pos1
        pos2 = token.feature.pos2

        if goshu == "漢":  # 'kan', meaning Chinese (Sino-Japanese word)
            num_kango += 1
        elif goshu == "和":  # 'wa', meaning (native) Japanese
            num_wago += 1

        if (
            pos1 == "動詞" and pos2 != "非自立可能"
        ):  # 'doushi', meaning verb; but not certain verbs like あり in あります
            num_verbs += 1
        elif pos1 == "助詞":  # 'joshi', meaning particles
            num_particles += 1

    percentage_of_kango = 100.0 * num_kango / total_tokens
    percentage_of_wago = 100.0 * num_wago / total_tokens
    percentage_of_verbs = 100.0 * num_verbs / total_tokens
    percentage_of_particles = 100.0 * num_particles / total_tokens

    # Linear model coefficients from Lee & Hasebe's levelled-corpora
    # regression; 11.724 is the intercept.
    readability_score = (
        mean_length_of_sentence * -0.056
        + percentage_of_kango * -0.126
        + percentage_of_wago * -0.042
        + percentage_of_verbs * -0.145
        + percentage_of_particles * -0.044
        + 11.724
    )

    return readability_score

Computes the readability of a Japanese text.

Args:
    text (str): The text to be scored.
    tagger (Optional[Tagger]): The fugashi parser used to parse the text.

Returns:
    float: A float representing the readability score of the text.