Home   Blog  

Understand readability of LLM output using numerical scores

Nov 14, 2025

This technique doesn’t tell you whether the answer provided by the LLM is correct or not. That is a job for an “evaluator”.

This technique helps you understand how easy or difficult the answer generated by the LLM is to read.

Let’s assume we are using the OpenAI Agents SDK, and we create three mathematical tools for your agent. The agent will use these tools to determine readability scores.

The tools in this example are:

1️⃣ Coleman-Liau Index

2️⃣ Flesch Reading Ease Score

3️⃣ Gunning-Fog Index

The score range used in this example ensures the simplified content can be easily read and understood at college-education level. The scores need to be adjusted as per the intended audience.

✨ Tool implementation:

# file: tools.py
from utils import count_words, count_sentences, count_letters_and_digits, count_syllables_in_text, count_complex_words
from agents import function_tool


@function_tool(name_override="coleman_liau_index_score")
def compute_coleman_liau_index_score(text: str) -> float:
    """
    Compute the Coleman-Liau index score for the given text.

    Args:
        text: The text to score. It must contain at least one word.

    Returns:
        The value of 0.0588 * L - 0.296 * S - 15.8, where L is the number of
        letters and digits per 100 words and S is the number of sentences per
        100 words.

    Raises:
        ValueError: If the text contains no words.
    """
    # Ref: https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
    word_count = count_words(text)
    if word_count == 0:
        # Without this guard an empty or whitespace-only text fails with an
        # opaque ZeroDivisionError from the per-100-words ratios below.
        raise ValueError("Cannot compute the Coleman-Liau index: the text contains no words.")

    L = count_letters_and_digits(text) / word_count * 100
    S = count_sentences(text) / word_count * 100

    return 0.0588 * L - 0.296 * S - 15.8


@function_tool(name_override="flesch_reading_ease_score")
def compute_flesch_reading_ease_score(text: str) -> float:
    """
    Compute the Flesch Reading Ease score for the given text.

    Args:
        text: The text to score. It must contain at least one word.

    Returns:
        The value of 206.835 - 1.015 * (words / sentences)
        - 84.6 * (syllables / words).

    Raises:
        ValueError: If the text contains no words.
    """
    # Ref: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    word_count = count_words(text)
    if word_count == 0:
        # Both ratios below divide by a count that would be zero.
        raise ValueError("Cannot compute the Flesch Reading Ease score: the text contains no words.")

    # Text with words but no sentence terminator is treated as a single
    # sentence instead of dividing by zero.
    sentence_count = max(count_sentences(text), 1)

    words_per_sentence = word_count / sentence_count
    syllables_per_word = count_syllables_in_text(text) / word_count

    return 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)


@function_tool(name_override="gunning_fog_index_score")
def compute_gunning_fog_index_score(text: str) -> float:
    """
    Compute the Gunning Fog Index score for the given text.

    Args:
        text: The text to score. It must contain at least one word.

    Returns:
        The value of 0.4 * ((words / sentences)
        + 100 * (complex words / words)).

    Raises:
        ValueError: If the text contains no words.
    """
    # Ref: https://en.wikipedia.org/wiki/Gunning_fog_index
    word_count = count_words(text)
    if word_count == 0:
        # Both ratios below divide by a count that would be zero.
        raise ValueError("Cannot compute the Gunning Fog Index: the text contains no words.")

    # Text with words but no sentence terminator is treated as a single
    # sentence instead of dividing by zero.
    sentence_count = max(count_sentences(text), 1)

    words_per_sentence = word_count / sentence_count
    complex_word_ratio = count_complex_words(text) / word_count

    return 0.4 * (words_per_sentence + (100 * complex_word_ratio))


# Relative tolerance (a fraction, not percentage points) used to widen the
# readability thresholds in the is_*_acceptable checks in this file.
ACCEPTABLE_MARGIN = 0.05 # +/- 5% margin for readability scores


@function_tool(name_override="is_coleman_liau_index_score_acceptable")
def is_coleman_liau_index_score_acceptable(score: float) -> bool:
    """
    Check whether a Coleman-Liau index score meets the target.

    The target is 17, widened upwards by ACCEPTABLE_MARGIN (a fraction of the
    target), so scores slightly above 17 are still accepted.

    Args:
        score: The Coleman-Liau index score to check.

    Returns:
        True if the score is at or below the widened target, False otherwise.
    """
    # Ref: https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
    target = 17.0
    upper_bound = target + (target * ACCEPTABLE_MARGIN)
    return score <= upper_bound


@function_tool(name_override="is_flesch_reading_ease_score_acceptable")
def is_flesch_reading_ease_score_acceptable(score: float) -> bool:
    """
    Check whether a Flesch Reading Ease score meets the target range.

    The target range is 20 to 40. Each end is widened outwards by
    ACCEPTABLE_MARGIN (a fraction of that end's value).

    Args:
        score: The Flesch Reading Ease score to check.

    Returns:
        True if the score lies within the widened range (inclusive),
        False otherwise.
    """
    # Ref: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    range_low = 20.0
    range_high = 40.0
    lower_bound = range_low - (range_low * ACCEPTABLE_MARGIN)
    upper_bound = range_high + (range_high * ACCEPTABLE_MARGIN)
    return lower_bound <= score and score <= upper_bound


@function_tool(name_override="is_gunning_fog_index_score_acceptable")
def is_gunning_fog_index_score_acceptable(score: float) -> bool:
    """
    Check whether a Gunning Fog Index score meets the target.

    The target is 17, widened upwards by ACCEPTABLE_MARGIN (a fraction of the
    target), so scores slightly above 17 are still accepted.

    Args:
        score: The Gunning Fog Index score to check.

    Returns:
        True if the score is at or below the widened target, False otherwise.
    """
    # Ref: https://en.wikipedia.org/wiki/Gunning_fog_index
    target = 17.0
    upper_bound = target + (target * ACCEPTABLE_MARGIN)
    return score <= upper_bound

✨ Helper utility functions:

# file: utils.py

def count_words(text: str) -> int:
    """
    Return how many whitespace-separated tokens the text contains.

    Punctuation attached to a token does not split it; empty or
    whitespace-only text yields 0.
    """
    tokens = text.split()
    return len(tokens)


def count_sentences(text: str) -> int:
    """
    Count the number of sentences in a given text.

    A sentence boundary is a run of terminators (. ! ? ; :), optionally
    followed by closing quotes or brackets, and then whitespace or the end of
    the text. Consequences of this heuristic:

    - "..." and "?!" count as one boundary, not one per character.
    - A period inside a token such as "3.14" is not a boundary.
    - Trailing text without a terminator still counts as a sentence, so any
      text that contains a word yields at least 1.

    Returns 0 only when the text contains no letters or digits.
    """
    import re

    terminators = r"[.!?;:]+"
    closers = r"[\"'”’)\]]*"
    boundary = terminators + closers + r"(?:\s+|$)"

    fragments = re.split(boundary, text)
    # Only fragments that hold actual content are sentences; this drops the
    # empty piece after a final terminator and pieces made purely of symbols.
    return sum(1 for fragment in fragments if any(ch.isalnum() for ch in fragment))


def count_letters_and_digits(text: str) -> int:
    """
    Return how many characters of the text are alphanumeric.

    Uses str.isalnum(), so whitespace and punctuation are not counted.
    """
    alphanumeric_chars = [ch for ch in text if ch.isalnum()]
    return len(alphanumeric_chars)


def count_syllables(word: str) -> int:
    """
    Estimate the number of syllables in a word.

    This is a heuristic and is not exact for every English word:

    1. Surrounding punctuation (commas, quotes, brackets, sentence
       terminators, ...) is stripped, so "fine," and "fine" score the same.
    2. Words of three characters or fewer count as one syllable.
    3. Each run of consecutive vowels (a, e, i, o, u, y) counts as one syllable.
    4. A trailing "e" is treated as silent and removes one syllable, except
       in a consonant + "le" ending ("table", "simple"), where it is voiced.

    The result is always at least 1, including for an empty string.

    (Based on an idea from a StackOverflow answer.)
    """
    import string

    vowels = "aeiouy"
    cleaned = word.lower().strip(string.punctuation + "“”‘’")

    if len(cleaned) <= 3:
        return 1  # Short words are assumed to have one syllable

    # Count vowel groups: a vowel starts a new syllable only when the
    # previous character was not a vowel.
    count = 0
    previous_was_vowel = False
    for ch in cleaned:
        is_vowel = ch in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if cleaned.endswith("e"):
        # len(cleaned) > 3 here, so cleaned[-3] always exists.
        voiced_le_ending = cleaned.endswith("le") and cleaned[-3] not in vowels
        if not voiced_le_ending:
            count -= 1  # Silent 'e' at the end

    return max(count, 1)  # Ensure at least one syllable


def count_syllables_in_text(text: str) -> int:
    """
    Return the sum of the estimated syllable counts of every
    whitespace-separated word in the text.
    """
    total = 0
    for token in text.split():
        total += count_syllables(token)
    return total


def is_complex_word(word: str) -> bool:
    """
    Tell whether a word counts as complex.

    A word is complex when its estimated syllable count is three or more.
    Change the threshold below if "complex" means something else for your
    audience.
    """
    syllables = count_syllables(word)
    return syllables >= 3


def count_complex_words(text: str) -> int:
    """
    Return how many whitespace-separated words in the text are complex
    according to is_complex_word().
    """
    complex_words = [token for token in text.split() if is_complex_word(token)]
    return len(complex_words)

✨ Prompts:

# Instructions for the agent that wraps hard-to-read passages in
# <complex> </complex> tags.
# NOTE(review): this is an f-string, so {text} is interpolated when this
# assignment executes; a variable named `text` must already be defined at that
# point, otherwise a NameError is raised. Confirm against the surrounding code.
# NOTE(review): the prompt mentions <reviewer-comments> tags, but this template
# has no placeholder that inserts any reviewer comments.
complex_text_finder_agent_instructions = f"""
You are a language specialist skilled at finding complex sentences in a text given to you.
Your job is to analyse the text given to you in <text> </text> XML tags and highlight complex sentences.
Use each tool exactly once.

**Your Task:**
Identify and highlight sentences or group of sentences in the text provided in <text> </text> XML tags, that contribute to high complexity, specifically:
    - Long sentences and/or dense paragraphs  
    - Use of complex, technical, or non-accessible vocabulary
    - Use of jargons, acronyms that is missing explanation for reader
    - Use Coleman-Liau index score and highlight text that with score higher than 17
    - Use Flesch Reading Ease and highlight text that score below 20 or above 40
    - Use Gunning Fog Index and highlight text that score higher than 17

**Additionally:**
Output in markdown format and wrap complicated sentences you identify inside <complex> </complex> tags.
Remove all other XML tags from the output except the <complex> </complex> tags.
Do not add any explanation or comments or XML or markdown labels, just return the text in markdown with highlighted complex sentences.

If there are comments given by the reviewer inside <reviewer-comments> </reviewer-comments> XML tags, use the reviewer comments to improve your output.

<text>{text}</text>
"""


# Instructions for the agent that rewrites the passages marked with
# <complex> </complex> tags in simpler language.
# NOTE(review): this is an f-string, so {num_of_words}, {rules_in_xml}, {text}
# and {reviewer_comments_if_any} are interpolated when this assignment
# executes; all four names must already be defined at that point, otherwise a
# NameError is raised. Confirm against the surrounding code.
simplifier_agent_instructions = f"""
You are a language specialist, skilled in simplifying complex text given to you.

**Your task**
    - Simplify the sections wrapped in <complex> </complex> tags inside the text given inside <text> </text> XML tags below.
    - Use the guidelines provided and maintain the text's original meaning and context 
    - Use bullet points where necessary to make the text more readable, use bold and italics to emphasize important points
    - Simplified text must adhere to the rules and corresponding capabilities provided in <rules> </rules> XML tags.

**Guidelines for simplification:**
    - Use clear and concise language, maintaining the original meaning
    - Avoid jargon, complex vocabulary and sentence structures
    - Break down long sentences into shorter ones
    - Represent a good target for general audiences, including high school students and adults who prefer straightforward language
    - Simplified version should be in plain english (college graduate level)
    - Must be suitable for audience with high school level reading comprehension
    - Use reviewer comments, if any, in the <reviewer-comments> </reviewer-comments> XML tags to improve output

Output in markdown format without markdown labelling, and in around {num_of_words} words.

<rules>{rules_in_xml}</rules>

<text>{text}</text>

<reviewer-comments>{reviewer_comments_if_any}</reviewer-comments>
"""


# Instructions for the agent that scores the simplified text with the
# readability tools and decides whether it passes the review.
# NOTE(review): this is an f-string, so {markdown_text} is interpolated when
# this assignment executes; a variable named `markdown_text` must already be
# defined at that point, otherwise a NameError is raised. Confirm against the
# surrounding code.
reviewer_agent_instructions = f"""
You are a language specialist skilled in analyzing readability text provided to you.

Your task is to adhere to the following output requirements for the text supplied to you in markdown format inside <markdown-text> </markdown-text> XML tags below.
Use each tool exactly once to compute the Coleman-Liau Index score, Flesch Reading Ease score, and Gunning Fog Index score for the text.

**Output Requirements:**
The following readability index scores against these **target standards** must be met:  
    - **Coleman-Liau Index**: This score must be 17 or lower  
    - **Flesch Reading Ease**: This score must lie between 20 and 40  
    - **Gunning Fog Index**: This score must be 17 or lower

Use the tools provided for specific score checks to determine if the scores meet the target standards.

If the text does not meet the standards, provide reviewer comments in 2-3 sentences for improvements. 
If you think no further simplification is needed, then pass the review.
Return the exact text reviewed by you after removing all XML tags or markdown labels if any.

<markdown-text>{markdown_text}</markdown-text>
"""

✨ Sample use:

from agents import Agent
from pydantic import BaseModel, Field

# Model name passed as `model` to the agents defined below.
deployment = "gpt-4.1-mini"


# Agent that marks complex passages. It is given the three score-computing
# tools but none of the is_*_acceptable checks.
complex_text_finder_agent = Agent(
    name="ComplexTextFinderAgent",
    instructions=complex_text_finder_agent_instructions, 
    tools=[
        compute_coleman_liau_index_score,
        compute_flesch_reading_ease_score,
        compute_gunning_fog_index_score
    ],
    model=deployment
)


# Agent that rewrites the marked passages. It is configured without any tools.
simplifier_agent = Agent(
    name="SimplifierAgent",
    instructions=simplifier_agent_instructions, 
    model=deployment
)


class ReviewResult(BaseModel):
    """Structured output produced by the reviewer agent."""

    # The description must be passed as `description=`: the first positional
    # argument of pydantic's Field() is the field's *default value*. Passing
    # the text positionally made the field optional, with the description
    # text as its value whenever the model omitted it.
    reviewed_text: str = Field(description="The exact text that was reviewed by the AI, without any XML tags")
    # NOTE(review): `ReadabilityScores` is not defined in this snippet; it must
    # be defined or imported before this class is created.
    readability_scores: ReadabilityScores = Field(description="Readability scores for the content")
    reviewer_comments: str = Field(description="Reviewer comments on the content's readability and complexity telling what can be improved further")
    # Flag used to decide whether another simplification round is needed.
    reviewer_passed: bool = Field(description="True if the content is readable and simple enough, False otherwise")


# Agent that reviews the simplified text. It gets the three score-computing
# tools plus the three is_*_acceptable checks, and its output is structured as
# a ReviewResult.
reviewer_agent = Agent(
    name="ReviewerAgent",
    instructions=reviewer_agent_instructions, 
    tools=[
        compute_coleman_liau_index_score,
        compute_flesch_reading_ease_score,
        compute_gunning_fog_index_score,
        is_coleman_liau_index_score_acceptable,
        is_flesch_reading_ease_score_acceptable,
        is_gunning_fog_index_score_acceptable
    ],
    output_type=ReviewResult,
    model=deployment
)

Based on the reviewer_passed flag in the structured output of the reviewer’s result, you can continue the agent loop and simplify again, repeating until the review passes or until you have exhausted the maximum number of turns for the agent loop. You need to decide how many turns you will allow the agent loop for the task.