Тепер необхідно побудувати модель регресії на реальному прикладі. У вас є файл `houses_simple.csv`, який містить інформацію про ціни на житло з площею як ознакою.

import pandas as pd

# Remote location of the `houses_simple.csv` dataset.
data_url = 'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/b22d1166-efda-45e8-979e-6c3ecfc566fc/houses_simple.csv'

# Load the CSV into a DataFrame and print its first rows.
df = pd.read_csv(data_url)
print(df.head())

Наступний крок — призначення змінних і візуалізація набору даних:

import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset from its remote location.
data_url = 'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/b22d1166-efda-45e8-979e-6c3ecfc566fc/houses_simple.csv'
df = pd.read_csv(data_url)

# Feature column (`square_feet`) and target column (`price`).
X, y = df['square_feet'], df['price']

# Scatter plot of price against area; alpha=0.5 makes the markers
# semi-transparent so overlapping points remain visible.
plt.scatter(X, y, alpha=0.5)
plt.show()

У прикладі зі зростом людини було набагато легше уявити собі лінію, яка добре підходить до даних.  

Але зараз наші дані мають значно більшу дисперсію, оскільки цільова змінна сильно залежить від багатьох інших факторів, таких як вік, розташування, інтер'єр тощо.  
У будь-якому випадку завдання полягає у побудові лінії, яка найкраще підходить до наявних даних; вона покаже тенденцію. Для цього слід використовувати клас `OLS` з бібліотеки `statsmodels`. Незабаром ми дізнаємося, як додати більше ознак, що покращить прогнозування!

import unittest
import importlib
import numpy as np
import pandas as pd
import statsmodels.api as sm


# Helper that reports a check under a message-derived test name.
def _dynamic_test(test_case, condition, ok_msg, fail_msg):
    """Record the outcome of a boolean check on ``test_case``.

    The test's ``_testMethodName`` is overwritten with ``ok_msg`` when
    ``condition`` is truthy and with ``fail_msg`` otherwise (presumably so
    that the runner's report shows the message in place of the method name).
    A truthy condition then registers a trivially passing assertion; a falsy
    one fails the test with ``fail_msg``.

    Args:
        test_case: Object exposing ``assertTrue`` and ``fail`` (in this file,
            a ``unittest.TestCase`` instance).
        condition: Value whose truthiness decides the outcome.
        ok_msg: Name assigned to the test on success.
        fail_msg: Name assigned to the test on failure; also the failure
            message.
    """
    # Evaluate truthiness exactly once and reuse the result below.
    passed = bool(condition)
    test_case._testMethodName = ok_msg if passed else fail_msg

    if not passed:
        test_case.fail(fail_msg)
    else:
        test_case.assertTrue(True)


class TestUserCode(unittest.TestCase):
    """Checks on the module-level names defined by the `user_code` module.

    Each test imports `user_code` (presumably the learner's submission),
    evaluates one boolean condition on its attributes and hands the result
    to `_dynamic_test` together with a success and a failure message.
    """

    # NOTE: the per-test descriptions below are `#` comments rather than
    # docstrings on purpose: unittest shows a test's docstring in verbose
    # output, so adding docstrings would change what gets reported.

    def test_y_is_price(self):
        # `y` must be a pandas Series named "price".
        import user_code

        target = getattr(user_code, "y", None)
        condition = isinstance(target, pd.Series) and target.name == "price"

        _dynamic_test(
            self,
            condition,
            ok_msg="The `y` variable correctly contains the `price` column.",
            fail_msg="Expected `y` to be assigned as df['price'].",
        )

    def test_X_tilde_is_add_constant(self):
        # `X_tilde` must be a DataFrame that has a "const" column.
        import user_code

        design = getattr(user_code, "X_tilde", None)
        condition = (
            isinstance(design, pd.DataFrame)
            and "const" in design.columns
        )

        _dynamic_test(
            self,
            condition,
            ok_msg="The `X_tilde` matrix is created using sm.add_constant.",
            fail_msg="Expected `X_tilde` to contain a constant column (using sm.add_constant).",
        )

    def test_regression_model_is_ols(self):
        # `regression_model` must be a statsmodels RegressionResultsWrapper.
        import user_code
        from statsmodels.regression.linear_model import RegressionResultsWrapper

        model = getattr(user_code, "regression_model", None)
        condition = isinstance(model, RegressionResultsWrapper)

        _dynamic_test(
            self,
            condition,
            ok_msg="The model is an instance of OLS and is fitted.",
            fail_msg="Expected `regression_model` to be a fitted OLS model.",
        )

    def test_X_new_tilde_correct(self):
        # `X_new` must exist and `X_new_tilde` must be a (3, 2) NumPy array.
        import user_code

        new_design = getattr(user_code, "X_new_tilde", None)
        condition = (
            hasattr(user_code, "X_new")
            and isinstance(new_design, np.ndarray)
            and new_design.shape == (3, 2)  # 3 rows, 2 columns
        )

        _dynamic_test(
            self,
            condition,
            ok_msg="The `X_new_tilde` matrix is correctly created with a constant column.",
            fail_msg="Expected `X_new_tilde` to be a 2-column array created using sm.add_constant.",
        )

    def test_y_pred_is_correct_shape(self):
        # `y_pred` must be a one-dimensional NumPy array of length 3.
        import user_code

        preds = getattr(user_code, "y_pred", None)
        condition = isinstance(preds, np.ndarray) and preds.shape == (3,)

        _dynamic_test(
            self,
            condition,
            ok_msg="The `y_pred` array has the correct shape (3,).",
            fail_msg="Expected `y_pred` to be a NumPy array with shape (3,).",
        )

    def test_predict_called(self):
        """
        Checks that the predictions are numbers and reasonable (not NaN or None).
        """
        import user_code

        try:
            preds = user_code.y_pred
            condition = (
                isinstance(preds, np.ndarray)
                and not np.isnan(preds).any()
                and preds.size == 3
            )
        except Exception:
            # A missing `y_pred`, or an array np.isnan cannot process,
            # counts as a failed check rather than an error in the test.
            condition = False

        _dynamic_test(
            self,
            condition,
            ok_msg="The predictions are valid numeric outputs.",
            fail_msg="Expected `y_pred` to contain valid numeric values.",
        )


# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()


test_code.py

Лінійна регресія є ключовим поняттям у прогностичній аналітиці. Вона широко використовується дата-сайентістами, аналітиками даних та статистиками, оскільки її легко побудувати та інтерпретувати, але вона достатньо потужна для багатьох завдань.

Почнемо з найпростішої моделі лінійної регресії. Ви ознайомитеся з основною ідеєю лінійної регресії та дізнаєтеся, як здійснювати прогнозування в Python.

Більшість реальних задач прогнозування включають більше ніж одну ознаку. Ви дізнаєтеся, як працювати з лінійною регресією з декількома ознаками.

Пряма лінія не завжди добре описує дані. Дізнаймося, як побудувати складнішу модель для прогнозування. Саме для цього підходить поліноміальна регресія.

Тепер, коли ви знаєте, як створювати різні моделі лінійної регресії, необхідно визначити спосіб вибору найкращої з них. Це можливо за допомогою метрик. У цьому розділі розглядаються найбільш поширені метрики та труднощі, з якими можна зіткнутися під час їх використання.

Завдання: Прогнозування Цін на Житло

Рішення