Drill Drill Drill Or: How I Learned to Stop Worrying About My Code and Love Testing

So this is a follow-up to my previous post. At the same time, it’s a rewrite (and maybe even improvement) on the original posts from cnlearn.app. Specifically, this relates to this post.

There’s one thing I changed/improved from the previous post. I started using more SQLAlchemy 1.4/2.0 notation after attending a great tutorial at Python Web Conference. If I don’t forget, I will update this in June or so when all the videos will be freely available online.

from sqlalchemy import Column, Integer, String, JSON
from sqlalchemy.orm import registry
from sqlalchemy.sql.expression import null
from sqlalchemy.sql.schema import ForeignKey
from sqlalchemy import create_engine

# Registry that collects the ORM mappings for the decorated classes below
# (SQLAlchemy 1.4 "mapped" declarative style).
mapper_registry = registry()


@mapper_registry.mapped
class Word:
    """A CEDICT dictionary entry, mapped to the "words" table."""

    __tablename__ = "words"

    id = Column(Integer, primary_key=True)
    simplified = Column(String(50))
    traditional = Column(String(50))
    pinyin_num = Column(String(100))
    pinyin_accent = Column(String(100))
    pinyin_clean = Column(String(100))
    pinyin_no_spaces = Column(String(100))
    also_written = Column(String(100))
    also_pronounced = Column(String(100))
    classifiers = Column(String(100))
    definitions = Column(String(500))
    frequency = Column(Integer)

    def __repr__(self):
        # Fixed: the closing quote and ">" were previously misplaced
        # (f"...{self.pinyin_accent}>'" ), yielding a malformed repr.
        return f"<Word(simplified='{self.simplified}', pinyin='{self.pinyin_accent}')>"


@mapper_registry.mapped
class Character:
    """A single Chinese character entry, mapped to the "characters" table."""

    __tablename__ = "characters"

    id = Column(Integer, primary_key=True)
    character = Column(String(1))
    definition = Column(String(150), nullable=True)
    pinyin = Column(String(50))
    decomposition = Column(String(15))
    etymology = Column(JSON(), nullable=True)
    radical = Column(String(1))
    matches = Column(String(100))
    frequency = Column(Integer)

    def __repr__(self):
        # Fixed: the closing quote after {self.radical} was missing.
        return f"<Character({self.character}, radical='{self.radical}')>"

# Create the SQLite database file and emit CREATE TABLE statements for every
# table registered on the mapper registry ("words" and "characters").
engine = create_engine("sqlite:///dictionary.db")
with engine.begin() as connection:  # begin() wraps the DDL in a committed transaction
    mapper_registry.metadata.create_all(connection)

But didn’t you say this is about testing? Indeed it is. Let’s have a look. I’m using Pytest for testing because it has Python and test in it so that’s a good enough reason. But really I just like the way tests can be parametrised, which avoids rewriting many things.

from typing import Dict, Optional

import pytest
from sqlalchemy import select
from sqlalchemy.engine import create_engine
from sqlalchemy.orm import Session, sessionmaker

from .db import Character, Word
from .pinyin_utils import convert_pinyin, last_vowel


@pytest.fixture
def db_session():
    """
    Yields a database session bound to the dictionary SQLite database.

    A yield fixture is used (instead of returning the session) so the
    session and engine are closed after the test that used them finishes;
    the original version returned the session and never closed it.
    """
    engine = create_engine("sqlite:///dictionary.db", future=True)
    # Avoid shadowing the imported Session class with the factory name.
    session_factory = sessionmaker(bind=engine, future=True)
    session: Session = session_factory()
    try:
        yield session
    finally:
        session.close()
        engine.dispose()


@pytest.fixture
def shi_character(db_session: Session):
    """
    Returns a two-tuple of query results for the 是 character: the exact
    matches on the simplified column (the dictionary holds two entries
    for 是) and every entry whose simplified form contains 是.
    """
    exact_matches = db_session.execute(
        select(Word).where(Word.simplified == "是")
    ).all()
    containing_matches = db_session.execute(
        select(Word).where(Word.simplified.contains("是"))
    ).all()
    return exact_matches, containing_matches


def test_number_entries_db(db_session: Session):
    """
    Counts the rows of the words table and checks that the total matches
    the number of entries loaded from the CEDICT file.
    """
    # Removed dead commented-out code; the fixture is used directly.
    total_entries: int = len(db_session.execute(select(Word)).all())
    assert total_entries == 116134


def test_number_shi_entries(shi_character):
    """
    Checks the count of exact 是 entries and the count of entries
    that merely contain the character.
    """
    exact, containing = shi_character
    assert len(exact) == 2
    assert len(containing) == 140


def test_individual_shi_entries(shi_character):
    """
    Checks the two exact 是 entries field by field: traditional form,
    accented pinyin and definitions.
    """
    first, second = shi_character[0][0], shi_character[0][1]
    assert first.Word.traditional == "是"
    assert second.Word.traditional == "昰"
    assert first.Word.pinyin_accent == "shì"
    assert second.Word.pinyin_accent == "shì"
    assert first.Word.definitions == "is; are; am; yes; to be"
    assert second.Word.definitions == "variant of 是(shì); (used in given names)"


@pytest.mark.parametrize(
    "simplified,traditional,pinyin_accent,definitions,classifiers,frequency",
    [
        ("我们", "我們", "wǒ men", "we; us; ourselves; our", "", 283794),
        ("越来越", "越來越", "yuè lái yuè", "more and more", "", 13386),
        (
            "不好意思",
            "不好意思",
            "bù hǎo yì si",
            "to feel embarrassed; to find it embarrassing; to be sorry (for inconveniencing sb)",
            "",
            3667,
        ),
        ("简直", "簡直", "jiǎn zhí", "simply; at all; practically", "", 6776),
        ("风景", "風景", "fēng jǐng", "scenery; landscape", "个", 3572),
        ("人", "人", "rén", "man; person; people", "个; 位", 373857),
        (
            "一掬同情之泪",
            "一掬同情之淚",
            "yī jū tóng qíng zhī lèi",
            "to shed tears of sympathy (idiom)",
            "",
            6639327,
        ),
        (
            "3C",
            "3C",
            "sān C",
            "abbr. for computers, communications, and consumer electronics; China Compulsory Certificate (CCC)",
            "",
            6639327,
        ),
        (
            "USB记忆棒",
            "USB記憶棒",
            "U S B jì yì bàng",
            "USB flash drive; see also 闪存盘(shǎn cún pán)",
            "",
            6639327,
        ),
        (
            "一哄而散",
            "一哄而散",
            "yī hōng ér sàn",
            "to disperse in confusion (idiom)",
            "",
            6639327,
        ),
        (
            "不畏强权",
            "不畏強權",
            "bù wèi qiáng quán",
            "not to submit to force (idiom); to defy threats and violence",
            "",
            6639327,
        ),
    ],
)
def test_words(
    db_session: Session,
    simplified: str,
    traditional: str,
    pinyin_accent: str,
    definitions: str,
    classifiers: str,
    frequency: int,
):
    """
    Checks that, for each parametrised word, the database row matches
    the values expected from the CEDICT file.
    """
    word = db_session.execute(select(Word).where(Word.simplified == simplified)).first()
    # .first() returns None when there is no match; fail with a clear
    # message instead of an AttributeError on the assertions below.
    assert word is not None, f"no database entry found for {simplified!r}"
    assert word.Word.simplified == simplified
    assert word.Word.traditional == traditional
    assert word.Word.pinyin_accent == pinyin_accent
    assert word.Word.definitions == definitions
    assert word.Word.classifiers == classifiers
    assert word.Word.frequency == frequency


@pytest.mark.parametrize(
    "word,number",
    [
        ("woah", 2),
        ("hao", 2),
        ("kong", 1),
        ("3 P", None),
        ("456", None),
    ],
)
def test_last_vowel(word: str, number: Optional[int]):
    """
    Checks last_vowel against known syllables; None is expected when
    the input contains no vowel at all.
    """
    assert last_vowel(word) == number


@pytest.mark.xfail(raises=ValueError)
def test_convert_pinyin_exceptions_integer():
    """
    Tests that passing something other than a string or a list of
    strings (here, an integer) to convert_pinyin raises a ValueError.
    """
    convert_pinyin(5, "accent")


@pytest.mark.xfail(raises=ValueError)
def test_convert_pinyin_exceptions_wrong_flag():
    """
    Tests that a flag other than `accent` or `clean` raises a ValueError.

    A valid string input is used so this test exercises the flag check;
    the previous version passed the integer 5, which already raises a
    ValueError for being the wrong input type, masking the flag path.
    """
    convert_pinyin("hao", "country")


@pytest.mark.parametrize(
    "simplified,pinyin,decomposition,etymology,radical",
    [
        (
            "好",
            "hǎo",
            "⿰女子",
            {
                "type": "ideographic",
                "hint": "A woman 女 with a son 子",
            },
            "女",
        ),
        (
            "戴",
            "dài",
            "⿻⿱十異戈",
            None,
            "戈",
        ),
    ],
)
def test_characters(
    db_session: Session,
    simplified: str,
    pinyin: str,
    decomposition: str,
    etymology: Dict,
    radical: str,
):
    """
    Checks that, for each parametrised character, the characters-table
    row matches the expected pinyin, decomposition, etymology (stored as
    JSON, may be None) and radical.
    """
    character = db_session.execute(
        select(Character).where(Character.character == simplified)
    ).first()
    # .first() returns None when there is no match; fail with a clear
    # message instead of an AttributeError on the assertions below.
    assert character is not None, f"no database entry found for {simplified!r}"
    assert character.Character.character == simplified
    assert character.Character.pinyin == pinyin
    assert character.Character.decomposition == decomposition
    assert character.Character.etymology == etymology
    assert character.Character.radical == radical

Ok now let’s go one by one.

from typing import Dict, Optional

import pytest
from sqlalchemy import select
from sqlalchemy.engine import create_engine
from sqlalchemy.orm import Session, sessionmaker

from .db import Character, Word
from .pinyin_utils import convert_pinyin, last_vowel

I first import the various things I need: for typing, for connecting to the database through SQLAlchemy, the pinyin utility functions I used and will test, the database models and pytest. What about the next block of code?

@pytest.fixture
def db_session():
    """
    Yields a database session bound to the dictionary SQLite database.

    A yield fixture is used (instead of returning the session) so the
    session and engine are closed after the test that used them finishes;
    the original version returned the session and never closed it.
    """
    engine = create_engine("sqlite:///dictionary.db", future=True)
    # Avoid shadowing the imported Session class with the factory name.
    session_factory = sessionmaker(bind=engine, future=True)
    session: Session = session_factory()
    try:
        yield session
    finally:
        session.close()
        engine.dispose()

What’s the @ doing there? Why is there nothing before the @ in that email? (terrible joke I know). In Python, the @ are used to precede a decorator. Pytest uses decorators to mark functions for several purposes including setting them as fixtures (to initialise test functions by setting up services, states or other operating environments) and marking tests (expected to fail perhaps). In this case, the @pytest.fixture marks the db_session function as a Pytest fixture. Then I can use db_session as an argument in test functions where it will simply act as the dictionary that was loaded.

What does the next fixture do?

@pytest.fixture
def shi_character(db_session: Session):
    """
    This fixture looks for the 是 character in the simplified column of the SQLite
    database. There are two dictionary entries for it. It also looks at all the
    expressions containing 是 in them. Returns a two-tuple:
    (exact matches, entries containing 是).
    """
    shi_entries = db_session.execute(select(Word).where(Word.simplified == "是")).all()
    shi_containing_entries = db_session.execute(
        select(Word).where(Word.simplified.contains("是"))
    ).all()
    return (shi_entries, shi_containing_entries)

In my original version of CNLearn, I used Python defaultdict to hold the data (but then decided against it for several reasons). Still, there are some tests that were run specifically to verify that I implemented certain features correctly. One of the errors I used to get was because of the 是 character. This fixture relates to that.

Now onto our actual tests after all this wait,

The fixtures are all gone and tests there were eight.

def test_number_entries_db(db_session: Session):
    """
    Counts the rows of the words table and checks that the total matches
    the number of entries loaded from the CEDICT file.
    """
    # Removed dead commented-out code; the fixture is used directly.
    total_entries: int = len(db_session.execute(select(Word)).all())
    assert total_entries == 116134

This test simply checked that the number of rows in the database matches what was in the CEDICT file.

The entries were all there, let us thank heaven,

One test is done and then there were seven.

def test_number_shi_entries(shi_character):
    """
    This tests the count of the individual 是 entries (two expected)
    and of the entries containing that character (140 expected).
    """
    individual_shi, group_shi = shi_character
    assert len(individual_shi) == 2
    assert len(group_shi) == 140

The current test tests if the words and expressions containing 是 are all there.

I hope that by now you get pytest’s tricks,

Another test is done and then there were six.

def test_individual_shi_entries(shi_character):
    """
    This tests the two individual 是 entries field by field:
    traditional form, accented pinyin and definitions.
    """
    # The fixture returns (exact matches, containing matches); take the first.
    individual_shi = shi_character[0]
    shi_1 = individual_shi[0]
    shi_2 = individual_shi[1]
    assert shi_1.Word.traditional == "是"
    assert shi_2.Word.traditional == "昰"
    assert shi_1.Word.pinyin_accent == "shì" and shi_2.Word.pinyin_accent == "shì"
    assert shi_1.Word.definitions == "is; are; am; yes; to be"
    assert shi_2.Word.definitions == "variant of 是(shì); (used in given names)"

We had the correct number of 是 containing entries in the database. This test adds upon that and tests whether the pinyin_accent, definitions, were all there.

These tests shouldn’t bore you, you should still feel alive,

Our 是 got so verified and then there were five.

@pytest.mark.parametrize(
    "simplified,traditional,pinyin_accent,definitions,classifiers,frequency",
    [
        ("我们", "我們", "wǒ men", "we; us; ourselves; our", "", 283794),
        ("越来越", "越來越", "yuè lái yuè", "more and more", "", 13386),
        (
            "不好意思",
            "不好意思",
            "bù hǎo yì si",
            "to feel embarrassed; to find it embarrassing; to be sorry (for inconveniencing sb)",
            "",
            3667,
        ),
        ("简直", "簡直", "jiǎn zhí", "simply; at all; practically", "", 6776),
        ("风景", "風景", "fēng jǐng", "scenery; landscape", "个", 3572),
        ("人", "人", "rén", "man; person; people", "个; 位", 373857),
        (
            "一掬同情之泪",
            "一掬同情之淚",
            "yī jū tóng qíng zhī lèi",
            "to shed tears of sympathy (idiom)",
            "",
            6639327,
        ),
        (
            "3C",
            "3C",
            "sān C",
            "abbr. for computers, communications, and consumer electronics; China Compulsory Certificate (CCC)",
            "",
            6639327,
        ),
        (
            "USB记忆棒",
            "USB記憶棒",
            "U S B jì yì bàng",
            "USB flash drive; see also 闪存盘(shǎn cún pán)",
            "",
            6639327,
        ),
        (
            "一哄而散",
            "一哄而散",
            "yī hōng ér sàn",
            "to disperse in confusion (idiom)",
            "",
            6639327,
        ),
        (
            "不畏强权",
            "不畏強權",
            "bù wèi qiáng quán",
            "not to submit to force (idiom); to defy threats and violence",
            "",
            6639327,
        ),
    ],
)
def test_words(
    db_session: Session,
    simplified: str,
    traditional: str,
    pinyin_accent: str,
    definitions: str,
    classifiers: str,
    frequency: int,
):
    """
    Checks that, for each parametrised word, the database row matches
    the values expected from the CEDICT file.
    """
    word = db_session.execute(select(Word).where(Word.simplified == simplified)).first()
    assert word.Word.simplified == simplified
    assert word.Word.traditional == traditional
    assert word.Word.pinyin_accent == pinyin_accent
    assert word.Word.definitions == definitions
    assert word.Word.classifiers == classifiers
    assert word.Word.frequency == frequency

Ok this is a long one. What is happening? We are using a new decorator: @pytest.mark.parametrize. What does it do? It parametrizes test functions. Let’s say I want to test that for a given word, the database information matches what I would expect from the CEDICT file. And I want to do that for another one. And for another one. And for another one. I would have to write many tests that essentially have the same structure, assert statements, etc. The only thing that would be different is the actual values. Parametrizing comes to the rescue. What I did in this current test was to first list the parameters of the function as comma separated values in a string, "simplified,traditional,pinyin_accent,definitions,classifiers,frequency", followed by a list of tuples [(), (), (), ...] where each tuple contained the values corresponding to each of the parameters detailed in the aforementioned string. Then the actual test function includes those 6 parameters but it also calls upon the db_session fixture. In the actual test, the statements are not tied to a specific character/word but to the parameters.

Everybody loves parameters, they want to add more,

Our tests are parametrised now and then there were four.

In the next function we are testing our last_vowel function.

@pytest.mark.parametrize(
    "word,number", [("woah", 2), ("hao", 2), ("kong", 1), ("3 P", None), ("456", None)]
)
def test_last_vowel(word: str, number: Optional[int]):
    """
    Tests the last vowel function; None is expected when the input
    contains no vowel at all.
    """
    assert last_vowel(word) == number

I am using parameters again. Not much else to say about this.

By now you love testing, we are on a spree,

The vowels were found and then there were three.

@pytest.mark.xfail(raises=ValueError)
def test_convert_pinyin_exceptions_integer():
    """
    Tests that passing something other than a string or a list of
    strings (here, an integer) to convert_pinyin raises a ValueError.
    """
    convert_pinyin(5, "accent")

A new decorator? Yes please. Oh! A new decorator. What does it do? It means that we expect this test to fail. If I want to convert a number to pinyin accent, that makes no sense and surely it should fail. And the test fails but it “passes” because I said it should fail. Is that test truly necessary or am I just playing around with pytest options? Yes.

You’re improving your knowledge, the decorator was new,

This test was a failure and then there were two.

@pytest.mark.xfail(raises=ValueError)
def test_convert_pinyin_exceptions_wrong_flag():
    """
    Tests that a flag other than `accent` or `clean` raises a ValueError.

    A valid string input is used so this test exercises the flag check;
    the previous version passed the integer 5, which already raises a
    ValueError for being the wrong input type, masking the flag path.
    """
    convert_pinyin("hao", "country")

This one tests whether passing an integer and calling convert_pinyin with the wrong flag will fail. It does.

You think you’re an expert and that we are done,

I used the wrong country and then there was one.

Oh, I haven’t tested the characters table in the database. Let’s do so with some parametrizing.

@pytest.mark.parametrize(
    "simplified,pinyin,decomposition,etymology,radical",
    [
        (
            "好",
            "hǎo",
            "⿰女子",
            {
                "type": "ideographic",
                "hint": "A woman 女 with a son 子",
            },
            "女",
        ),
        (
            "戴",
            "dài",
            "⿻⿱十異戈",
            None,
            "戈",
        ),
    ],
)
def test_characters(
    db_session: Session,
    simplified: str,
    pinyin: str,
    decomposition: str,
    etymology: Dict,
    radical: str,
):
    """
    Checks that, for each parametrised character, the characters-table
    row matches the expected pinyin, decomposition, etymology (stored as
    JSON, may be None) and radical.
    """
    character = db_session.execute(
        select(Character).where(Character.character == simplified)
    ).first()
    assert character.Character.character == simplified
    assert character.Character.pinyin == pinyin
    assert character.Character.decomposition == decomposition
    assert character.Character.etymology == etymology
    assert character.Character.radical == radical

This post was so long, we had a good run,

We checked the two tables and then there were none.