import unittest
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


# Helper functions
def _require_success(test_case, condition, failure_message):
    """Abort the running test with ``failure_message`` unless ``condition`` holds.

    This is a precondition check: when ``condition`` is truthy nothing is
    touched and the caller continues.

    Args:
        test_case: The test object; it must expose ``_testMethodName`` and
            ``fail`` (as ``unittest.TestCase`` does).
        condition: Any value; only its truthiness is used.
        failure_message: Text describing the unmet requirement.

    Raises:
        Whatever ``test_case.fail`` raises (``AssertionError`` for a standard
        ``unittest.TestCase``), carrying ``failure_message``.
    """
    if condition:
        return
    # The test is renamed to the message before failing.
    # NOTE(review): presumably the grading runner displays the test name as
    # learner-facing feedback - confirm against the runner.
    test_case._testMethodName = failure_message
    # Pass the message along so the raised error is not an empty
    # "AssertionError: None" in the standard unittest report.
    test_case.fail(failure_message)

def _check_success(test_case, condition, success_message, failure_message):
    """Record the final verdict of a test by renaming it, failing if needed.

    Unlike a precondition check, this always renames the test: to
    ``success_message`` when ``condition`` is truthy, otherwise to
    ``failure_message`` before failing.

    Args:
        test_case: The test object; it must expose ``_testMethodName`` and
            ``fail`` (as ``unittest.TestCase`` does).
        condition: Any value; only its truthiness is used.
        success_message: Name given to the test when the check passes.
        failure_message: Name given to the test when the check fails.

    Raises:
        Whatever ``test_case.fail`` raises (``AssertionError`` for a standard
        ``unittest.TestCase``), carrying ``failure_message``, when
        ``condition`` is falsy.
    """
    if condition:
        test_case._testMethodName = success_message
        return
    test_case._testMethodName = failure_message
    # Pass the message along so the raised error is not an empty
    # "AssertionError: None" in the standard unittest report.
    test_case.fail(failure_message)


class TestUserCode(unittest.TestCase):
    """Checks for the learner's ``user_code`` module (stemming challenge).

    Most tests import ``user_code`` and inspect its module-level names; two
    tests also read the text of ``user_code.py`` from the current working
    directory. Verdicts are reported through the module-level helpers
    ``_require_success`` and ``_check_success``, which rename the running
    test to a human-readable message.

    Test methods are described with ``#`` comments rather than docstrings on
    purpose: a docstring would change what ``unittest`` prints as the test's
    short description.
    """

    # Path of the learner's source file, relative to the working directory.
    _USER_SOURCE_PATH = "user_code.py"

    def _read_user_source(self):
        """Return the full text of the learner's source file."""
        # Explicit UTF-8 (Python's default source encoding) keeps the read
        # independent of the platform locale.
        with open(self._USER_SOURCE_PATH, "r", encoding="utf-8") as source_file:
            return source_file.read()

    def _require_declared(self, module, name):
        """Fail the test unless ``module`` has an attribute called ``name``."""
        _require_success(
            self,
            hasattr(module, name),
            f"`{name}` is not declared"
        )

    def _require_type(self, module, name, expected_type, type_label):
        """Fail the test unless ``module.<name>`` is an ``expected_type``.

        ``type_label`` is the wording used in the message, article included
        (for example ``"a list"``).
        """
        _require_success(
            self,
            isinstance(getattr(module, name), expected_type),
            f"`{name}` is not {type_label}"
        )

    def _require_nltk_module(self, module):
        """Fail the test unless ``module.nltk`` is the real ``nltk`` package."""
        self._require_declared(module, "nltk")
        _require_success(
            self,
            module.nltk is nltk,
            "`nltk` is declared, but it's not `nltk` library"
        )

    # Test that everything is imported correctly
    def test_imports(self):
        import user_code
        # (attribute name, object it must be, message when it is something else)
        expected_imports = (
            (
                "word_tokenize",
                word_tokenize,
                "`word_tokenize` is declared, but it's not `nltk.tokenize.word_tokenize`",
            ),
            (
                "stopwords",
                stopwords,
                "`stopwords` is declared, but it's not `nltk.corpus.stopwords`",
            ),
            (
                "PorterStemmer",
                PorterStemmer,
                "`PorterStemmer` is declared, but it's not `nltk.stem.PorterStemmer`",
            ),
        )
        for name, expected_object, mismatch_message in expected_imports:
            self._require_declared(user_code, name)
            # Identity, not equality: the name must be bound to the very
            # object exported by NLTK.
            _require_success(
                self,
                getattr(user_code, name) is expected_object,
                mismatch_message
            )
        self._require_nltk_module(user_code)
        self._testMethodName = "imports are correct"

    # Test that required NLTK resources are downloaded
    def test_nltk_downloads(self):
        import user_code
        self._require_nltk_module(user_code)
        # The download calls are verified textually in the source file.
        user_code_text = self._read_user_source()
        _require_success(
            self,
            re.search(r"nltk *\. *download *\( *'punkt_tab' *\)", user_code_text),
            "Code for downloading `'punkt_tab'` tokenizer shouldn't be modified"
        )
        _require_success(
            self,
            re.search(r"nltk *\. *download *\( *'stopwords' *\)", user_code_text),
            "Code for downloading `'stopwords'` shouldn't be modified"
        )
        self._testMethodName = "NLTK resources are downloaded"

    # Test original text
    def test_text(self):
        import user_code
        expected = "Despite the pouring rain, the overwhelming sense of joy and accomplishment made the day unforgettable!"
        self._require_declared(user_code, "text")
        _check_success(
            self,
            user_code.text == expected,
            "`text` contains correct value",
            "`text` shouldn't be modified"
        )

    # Test lowercase conversion
    def test_text_lower(self):
        import user_code
        self._require_declared(user_code, "text")
        self._require_type(user_code, "text", str, "a string")
        self._require_declared(user_code, "text_lower")
        _check_success(
            self,
            user_code.text_lower == user_code.text.lower(),
            "`text_lower` is computed correctly",
            "`text_lower` is not the lowercase of `text`"
        )

    # Test tokenization of text_lower
    def test_tokens(self):
        import user_code
        self._require_declared(user_code, "text_lower")
        self._require_type(user_code, "text_lower", str, "a string")
        self._require_declared(user_code, "tokens")
        # The expectation is derived from the learner's own `text_lower`, so
        # an earlier mistake is not reported a second time here.
        _check_success(
            self,
            user_code.tokens == word_tokenize(user_code.text_lower),
            "`tokens` is computed correctly",
            "`text_lower` is not correctly tokenized into words"
        )

    # Test stop_words set
    def test_stop_words(self):
        import user_code
        self._require_declared(user_code, "stop_words")
        self._require_type(user_code, "stop_words", set, "a set")
        _check_success(
            self,
            user_code.stop_words == set(stopwords.words('english')),
            "`stop_words` contains correct English stopwords set",
            "`stop_words` doesn't contain English stopwords"
        )

    # Test filtered_tokens filtering
    def test_filtered_tokens(self):
        import user_code
        self._require_declared(user_code, "tokens")
        self._require_type(user_code, "tokens", list, "a list")
        self._require_declared(user_code, "stop_words")
        self._require_type(user_code, "stop_words", set, "a set")
        self._require_declared(user_code, "filtered_tokens")
        # Built from the learner's own `tokens` and `stop_words`.
        expected = [tok for tok in user_code.tokens if tok not in user_code.stop_words]
        _check_success(
            self,
            user_code.filtered_tokens == expected,
            "`filtered_tokens` is computed correctly",
            "stop words in `tokens` are not correctly filtered out"
        )

    # Test stemmer instance
    def test_stemmer(self):
        import user_code
        self._require_declared(user_code, "stemmer")
        _check_success(
            self,
            isinstance(user_code.stemmer, PorterStemmer),
            "`stemmer` is defined correctly",
            "`stemmer` is not an instance of `PorterStemmer`"
        )

    # Test stemming results
    def test_stemmed_tokens(self):
        import user_code
        self._require_declared(user_code, "stemmer")
        self._require_type(
            user_code, "stemmer", PorterStemmer, "an instance of `PorterStemmer`"
        )
        self._require_declared(user_code, "filtered_tokens")
        self._require_type(user_code, "filtered_tokens", list, "a list")
        self._require_declared(user_code, "stemmed_tokens")
        # Built with the learner's own stemmer and `filtered_tokens`.
        expected = [user_code.stemmer.stem(tok) for tok in user_code.filtered_tokens]
        _check_success(
            self,
            user_code.stemmed_tokens == expected,
            "`stemmed_tokens` is computed correctly",
            "`filtered_tokens` are not correctly stemmed"
        )

    # Test print statement
    def test_print(self):
        user_code_text = self._read_user_source()
        # Single-quoted raw string so the double quotes around the label are
        # plain literal characters in the pattern.
        _check_success(
            self,
            re.search(r'print *\( *"Stemmed tokens:" *, *stemmed_tokens *\)', user_code_text),
            "print statement is correct",
            "print statement shouldn't be modified"
        )


if __name__ == '__main__':
    # Download the NLTK resources first, in this order, then hand control to
    # the unittest command-line runner.
    for resource_name in ('punkt_tab', 'stopwords'):
        nltk.download(resource_name)
    unittest.main()

test_main.py

Esplora i fondamenti dell'Elaborazione del Linguaggio Naturale (NLP) apprendendo le tecniche essenziali di pre-elaborazione del testo e i metodi per rappresentare i dati testuali. Acquisisci esperienza pratica con gli strumenti utilizzati per pulire, analizzare e interpretare le informazioni testuali. Sviluppa le competenze necessarie per trasformare il linguaggio grezzo in informazioni strutturate, ponendo solide basi per applicazioni avanzate in intelligenza artificiale e machine learning.

Approfondimento sui fondamenti della pre-elaborazione del testo per preparare il testo grezzo all'analisi. Apprendimento delle tecniche di tokenizzazione, filtraggio delle stop word e personalizzazione della tokenizzazione tramite espressioni regolari.

Scopri come le parole possono essere ridotte alle loro forme base utilizzando stemming e lemmatizzazione. Apprendi il part-of-speech tagging per arricchire il testo con il contesto grammaticale e applica la lemmatizzazione basata sul POS.

Scopri come il testo può essere rappresentato con numeri utilizzando modelli di spazio vettoriale. Acquisisci esperienza pratica implementando e personalizzando due modelli di spazio vettoriale popolari: bag of words e TF-IDF.

Acquisire una solida comprensione dei word embeddings e di come catturano il significato semantico. Esplorazione delle architetture CBoW e Skip-gram utilizzate in Word2Vec, con implementazione autonoma.

Sfida: Stemming

Soluzione