Skip to content

Commit

Permalink
Fix Bug in Resid / Predict when Model is Empty (#597)
Browse files Browse the repository at this point in the history
* flatten residual

* add tests for empty models

* bump version, build lock file

* update tests

* add test typo

* update tests

* fix test typo

* cleanups

* fix resid for weights

* only test predict when fepois without fixef

* fix bug in predict method with WLS
  • Loading branch information
s3alfisc authored Aug 31, 2024
1 parent 5ae158a commit 05f44bd
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 92 deletions.
Binary file removed .coverage
Binary file not shown.
141 changes: 83 additions & 58 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyfixest/estimation/FixestMulti_.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def _estimate_all_models(
# special case: sometimes it is useful to fit models as
# "Y ~ 0 | f1 + f2" to demean Y and to use the predict() method
if FIT._X_is_empty:
FIT._u_hat = Y.to_numpy() - Yd_array
FIT._u_hat = Yd_array
else:
FIT.get_fit()

Expand Down
4 changes: 2 additions & 2 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -1465,7 +1465,7 @@ def predict(
)

if newdata is None:
return self._Y_untransformed.to_numpy().flatten() - self._u_hat.flatten()
return self._Y_untransformed.to_numpy().flatten() - self.resid()

newdata = _polars_to_pandas(newdata).reset_index(drop=False)

Expand Down Expand Up @@ -1803,7 +1803,7 @@ def resid(self) -> np.ndarray:
np.ndarray
A np.ndarray with the residuals of the estimated regression model.
"""
return self._u_hat
return self._u_hat.flatten() / np.sqrt(self._weights).flatten()

def ritest(
self,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pyfixest"
version = "0.24.1"
version = "0.24.2"

description = "Fast high dimensional fixed effect estimation following syntax of the fixest R package. Supports OLS, IV and Poisson regression and a range of inference procedures (HC1-3, CRV1 & CRV3, wild bootstrap, randomization inference, simultaneous CIs, Romano-Wolf's multiple testing correction). Additionally, supports (some of) the regression based new Difference-in-Differences Estimators (Did2s, Linear Projections)."
authors = ["Alexander Fischer <[email protected]>", "Styfen Schär"]
Expand Down
84 changes: 56 additions & 28 deletions tests/test_vs_fixest.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@
("Y~X1|f2^f3"),
("Y~X1|f1 + f2^f3"),
("Y~X1|f2^f3^f1"),
# empty models
("Y ~ 1 | f1"),
("Y ~ 1 | f1 + f2"),
("Y ~ 0 | f1"),
("Y ~ 0 | f1 + f2"),
]

iv_fmls = [
Expand Down Expand Up @@ -122,15 +127,11 @@ def check_absolute_diff(x1, x2, tol, msg=None):
@pytest.mark.parametrize("error_type", ["2"])
@pytest.mark.parametrize("dropna", [False])
@pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}])
# @pytest.mark.parametrize("inference", ["iid", {"CRV1": "group_id"}])
@pytest.mark.parametrize("weights", [None, "weights"])
@pytest.mark.parametrize("f3_type", ["str", "object", "int", "categorical", "float"])
@pytest.mark.parametrize("fml", ols_fmls + ols_but_not_poisson_fml)
@pytest.mark.parametrize("adj", [False, True])
# see here for why not test against cluster_adj = True
# it triggers the N / (N-1) correction, not sure why
# https://github.com/lrberge/fixest/issues/518#issuecomment-2227365516
@pytest.mark.parametrize("cluster_adj", [False])
@pytest.mark.parametrize("cluster_adj", [False, True])
def test_single_fit_feols(
N,
seed,
Expand Down Expand Up @@ -197,9 +198,8 @@ def test_single_fit_feols(
py_confint = mod.confint().xs("X1").values
py_nobs = mod._N
py_vcov = mod._vcov[0, 0]

py_resid = mod._u_hat.flatten() # noqa: F841
# TODO: test residuals
py_resid = mod.resid()
py_predict = mod.predict()

df_X1 = _get_r_df(r_fixest)

Expand All @@ -209,16 +209,32 @@ def test_single_fit_feols(
r_tstat = df_X1["statistic"]
r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64)
r_nobs = int(stats.nobs(r_fixest)[0])
r_resid = r_fixest.rx2("working_residuals") # noqa: F841
r_vcov = stats.vcov(r_fixest)[0, 0]
r_resid = stats.residuals(r_fixest)
r_predict = stats.predict(r_fixest)

if not mod._X_is_empty:
if inference == "iid" and adj and cluster_adj:
check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")

check_absolute_diff(py_vcov, r_vcov, 1e-08, "py_vcov != r_vcov")
check_absolute_diff(py_se, r_se, 1e-08, "py_se != r_se")
check_absolute_diff(py_pval, r_pval, 1e-08, "py_pval != r_pval")
check_absolute_diff(py_tstat, r_tstat, 1e-07, "py_tstat != r_tstat")
check_absolute_diff(py_confint, r_confint, 1e-08, "py_confint != r_confint")

# residuals are invariant to the vcov type
if inference == "iid" and adj and not cluster_adj:
check_absolute_diff(
(py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid"
)
check_absolute_diff(
py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict"
)

check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
check_absolute_diff(py_vcov, r_vcov, 1e-08, "py_vcov != r_vcov")
check_absolute_diff(py_se, r_se, 1e-08, "py_se != r_se")
check_absolute_diff(py_pval, r_pval, 1e-08, "py_pval != r_pval")
check_absolute_diff(py_tstat, r_tstat, 1e-07, "py_tstat != r_tstat")
check_absolute_diff(py_confint, r_confint, 1e-08, "py_confint != r_confint")
if mod._X_is_empty:
assert mod._beta_hat.size == 0

if not weights:
py_r2 = mod._r2
Expand Down Expand Up @@ -296,9 +312,7 @@ def test_single_fit_fepois(
py_nobs = mod._N
py_vcov = mod._vcov[0, 0]
py_deviance = mod.deviance

py_resid = mod._u_hat.flatten() # noqa: F841
# TODO: test residuals
py_resid = mod.resid()

df_X1 = _get_r_df(r_fixest)

Expand All @@ -308,19 +322,29 @@ def test_single_fit_fepois(
r_tstat = df_X1["statistic"]
r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64)
r_nobs = int(stats.nobs(r_fixest)[0])
r_resid = r_fixest.rx2("working_residuals") # noqa: F841
r_resid = stats.residuals(r_fixest)
r_vcov = stats.vcov(r_fixest)[0, 0]
r_deviance = r_fixest.rx2("deviance")

check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
if inference == "iid" and adj and cluster_adj:
check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_coef != r_coef")

check_absolute_diff(py_vcov, r_vcov, 1e-06, "py_vcov != r_vcov")
check_absolute_diff(py_se, r_se, 1e-06, "py_se != r_se")
check_absolute_diff(py_pval, r_pval, 1e-06, "py_pval != r_pval")
check_absolute_diff(py_tstat, r_tstat, 1e-06, "py_tstat != r_tstat")
check_absolute_diff(py_confint, r_confint, 1e-06, "py_confint != r_confint")
check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance")

if not mod._has_fixef:
py_predict = mod.predict()
r_predict = stats.predict(r_fixest)
check_absolute_diff(
py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict"
)


@pytest.mark.parametrize("N", [1000])
@pytest.mark.parametrize("seed", [76540251])
Expand Down Expand Up @@ -398,9 +422,8 @@ def test_single_fit_iv(
py_confint = mod.confint().xs("X1").values
py_nobs = mod._N
py_vcov = mod._vcov[0, 0]

py_resid = mod._u_hat.flatten() # noqa: F841
# TODO: test residuals
py_resid = mod.resid()
py_predict = mod.predict()

df_X1 = _get_r_df(r_fixest)

Expand All @@ -410,11 +433,16 @@ def test_single_fit_iv(
r_tstat = df_X1["statistic"]
r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64)
r_nobs = int(stats.nobs(r_fixest)[0])
r_resid = r_fixest.rx2("working_residuals") # noqa: F841
r_resid = stats.resid(r_fixest)
r_predict = stats.predict(r_fixest)
r_vcov = stats.vcov(r_fixest)[0, 0]

check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
if inference == "iid" and adj and cluster_adj:
check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
check_absolute_diff(py_predict[0:5], r_predict[0:5], 1e-07, "py_coef != r_coef")
check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_coef != r_coef")

check_absolute_diff(py_vcov, r_vcov, 1e-07, "py_vcov != r_vcov")
check_absolute_diff(py_se, r_se, 1e-07, "py_se != r_se")
check_absolute_diff(py_pval, r_pval, 1e-06, "py_pval != r_pval")
Expand Down
4 changes: 2 additions & 2 deletions tests/texfiles/test.tex
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
\midrule
f1 & x & x \\
\midrule
$R^2$ & 0.489 & - \\
S.E. type & by: f1 & by: f1+f2 \\
Observations & 1994 & 997 \\
S.E. type & by: f1 & by: f1+f2 \\
$R^2$ & 0.489 & - \\
\bottomrule
\end{tabular}
\footnotesize Significance levels: $*$ p $<$ 0.05, $**$ p $<$ 0.01, $***$ p $<$ 0.001. Format of coefficient cell: Coefficient
Expand Down

0 comments on commit 05f44bd

Please sign in to comment.