\documentclass[aspectratio=169]{beamer}
\usetheme{metropolis}
%\usepackage{beamerthemesplit}
%\beamertemplatenavigationsymbolsempty
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{latexsym}
\usepackage{graphicx}
\usepackage{fancybox}
\usepackage{dsfont}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{booktabs}
\usepackage{dcolumn}
\usepackage{soul}
\usepackage[cache=false]{minted}
\usepackage{MnSymbol}
\usepackage{stmaryrd}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\X}{\mathtt{X}}
\newcommand{\Y}{\mathtt{Y}}
%\newcommand{\R}{\mathbb{R}}
%\newcommand{\E}{\mathbb{E}}
%\newcommand{\V}{\mathbb{V}}
\newcommand{\p}{\mathbb{P}}
\newcommand*\df{\mathop{}\!\mathrm{d}}
\newcommand{\del}{\partial}
% imports
\usepackage{xargs}
\usepackage{xpatch}
\usepackage{etoolbox}
\usepackage{pdflscape}
\usepackage{booktabs}
\usepackage{threeparttable}
\usepackage[skip=0.2\baselineskip]{caption}
% command for inputting raw latex
\makeatletter
\newcommand\primitiveinput[1]{\@@input #1 }
\makeatother
% common table command
\newcommandx{\tablecontent}[4]{
\begin{threeparttable}[!ht]
\centering
\caption{#3}
\vspace{-1em}
\footnotesize
\begin{tabular}{#1}
\primitiveinput{../tables/#2.tex}
\end{tabular}
\vspace{-0.2em}
\begin{tablenotes}[flushleft]
#4
\end{tablenotes}
\end{threeparttable}
}
% \usepackage{slashbox}
\title{Lecture 2: Maximum Likelihood and Friends}
\author{Chris Conlon }
\institute{NYU Stern }
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
\newcommand{\ol}{\overline}
%\newcommand{\ul}{\underline}
\newcommand{\pp}{{\prime \prime}}
\newcommand{\ppp}{{\prime \prime \prime}}
\newcommand{\policy}{\gamma}
\newcommand{\fp}{\frame[plain]}
\date{\today}
\begin{document}
\maketitle
\begin{frame}{Introduction}
Consider a linear regression with $\varepsilon_i | X_i \sim N(0,\sigma^2)$
\begin{align*}
Y_{i} &= X_{i}'\beta + \varepsilon_{i}
\end{align*}
We've discussed the \alert{least squares estimator}:
\begin{align*}
\widehat{\beta}_{ols} &= \arg \min_{\beta} \sum_{i=1}^N (Y_i - X_i' \beta)^2\\
\widehat{\beta}_{ols} &= (\mathbf{X}'\mathbf{X})^{-1} \mathbf{X}' \mathbf{Y}
\end{align*}
\end{frame}
\begin{frame}{Review: What is a Likelihood?}
Suppose we write down the joint distribution of our data $(y_i,x_i)$ for $i=1,\ldots,n$.
\begin{align*}
Pr(y_1,\ldots,y_n, x_1,\ldots,x_n | \theta)
\end{align*}
If $(y_i,x_i)$ are I.I.D then we can write this as:
\begin{align*}
Pr(y_1,\ldots,y_n, x_1,\ldots,x_n | \theta) = \prod_{i=1}^N Pr(y_i, x_i | \theta) \propto \prod_{i=1}^N Pr(y_i | x_i , \theta)=L( \mathbf{y}| \mathbf{x} ,\theta )
\end{align*}
We call this $L( \mathbf{y}| \mathbf{x} ,\theta )$ the \alert{likelihood} of the observed data.
\end{frame}
\begin{frame}{MLE: Example}
If we know the distribution of $\varepsilon_i$ we can construct a \alert{maximum likelihood estimator}
\begin{align*}
(\widehat{\beta}_{MLE},\widehat{\sigma}^2_{MLE}) &= \arg \max_{\beta,\sigma^2} L(\beta,\sigma^2)
\end{align*}
Where
\begin{align*}
L(\beta,\sigma^2) &= \prod_{i=1}^N p(y_i | x_i,\beta,\sigma^2) \\
&= \prod_{i=1}^N \frac{1}{\sqrt{2 \pi \sigma^2}} \exp \left[-\frac{1}{2\sigma^2}(Y_i - X_i' \beta)^2 \right]\\
\ell(\beta,\sigma^2) &= -\frac{N}{2} \ln (2 \pi \sigma^2) - \frac{1}{2 \sigma^2} \sum_{i=1}^N(Y_i - X_i' \beta)^2
\end{align*}
\end{frame}
\begin{frame}{MLE: FOC's}
Take the FOC's
\begin{align*}
\ell(\beta,\sigma^2) &= -\frac{N}{2} \ln (2 \pi \sigma^2) - \frac{1}{2 \sigma^2} \sum_{i=1}^N(Y_i - X_i' \beta)^2
\end{align*}
Where
\begin{align*}
\frac{ \partial \ell(\beta,\sigma^2) }{\partial \beta}&= \frac{1}{ \sigma^2}\sum_{i=1}^N X_i (Y_i - X_i' \beta) = 0 \rightarrow \widehat{\beta}_{MLE}= \widehat{\beta}_{OLS}\\
\frac{ \partial \ell(\beta,\sigma^2) }{\partial \sigma^2}&= -\frac{N}{2 \sigma^2} + \frac{1}{2 \sigma^4} \sum_{i=1}^N(Y_i - X_i' \beta)^2 = 0 \\
\sigma^2_{MLE} &= \frac{1}{N} \sum_{i=1}^N (Y_i - X_i' \beta)^2
\end{align*}
Note: the unbiased estimator uses $\frac{1}{N-K-1}$.
\end{frame}
\begin{frame}{MLE: General Case}
\begin{enumerate}
\item Start with the \alert{joint density of the data} $Z_1,\ldots,Z_N$ with density $f_Z(z,\theta)$
\item Construct the likelihood function of the sample $z_1,\ldots,z_N$
\begin{align*}
L(\mathbf{z} | \theta) = \prod_{i=1}^N f_Z(z_i,\theta)
\end{align*}
\item Construct the \alert{log likelihood} (this has the same $\arg \max$)
\begin{align*}
\ell(\mathbf{z} | \theta) = \sum_{i=1}^N \ln f_Z(z_i,\theta)
\end{align*}
\item Take the FOC's to find $\widehat{\theta}_{MLE}$
\begin{align*}
\theta : \frac{\partial \ell(\theta)}{\partial \theta} =0
\end{align*}
\end{enumerate}
\end{frame}
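\begin{frame}[fragile]{MLE: General Case in Code}
\footnotesize
A minimal sketch of this recipe in R on simulated normal data (the data and starting values are illustrative; base R \texttt{optim} minimizes, so we hand it the \emph{negative} log likelihood):
\begin{semiverbatim}
set.seed(1)
z <- rnorm(1000, mean = 2, sd = 1.5)       # step 1: sample from f_Z(z, theta_0)
nll <- function(theta)                     # steps 2-3: negative log likelihood
  -sum(dnorm(z, mean = theta[1], sd = exp(theta[2]), log = TRUE))
fit <- optim(c(0, 0), nll)                 # step 4: solve the FOCs numerically
c(fit$par[1], exp(fit$par[2]))             # roughly (2, 1.5)
\end{semiverbatim}
\end{frame}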
\begin{frame}{MLE in Detail}
Basic Setup: we know the parametric family $F(z|\theta)$ but not the true $\theta_0$. We know $\theta_0 \in \Theta \subset \mathbb{R}^K$.
\begin{itemize}
\item Begin with a sample of $z_i$ from $i=1,\ldots,N$ which are I.I.D. with CDF $F(z|\theta_0)$.
\item The MLE chooses
\begin{align*}
\widehat{\theta}_{MLE} = \arg \max_{\theta} \ell(\theta) = \arg \max_{\theta} \sum_{i=1}^N \ln f_Z(z_i,\theta)
\end{align*}
\end{itemize}
\end{frame}
\begin{frame}{MLE: Technical Details}
\begin{enumerate}
\item Consistency. When is it true that, for all $\varepsilon>0$,
\begin{align*}
\lim _ { N \rightarrow \infty } \operatorname { Pr } \left( \left\| \hat { \theta } _ { m l e } - \theta _ { 0 } \right\| > \varepsilon \right) = 0
\end{align*}
\item Asymptotic Normality. What else do we need to show?
\begin{align*}
\sqrt{N}\left(\hat{\theta}_{mle} - \theta_0\right) \stackrel{d}{\longrightarrow} \mathcal{N}\left(0, -\left[\mathbb{E}\,\frac{\partial^2 \ln f}{\partial \theta \partial \theta'}\left(Z_i, \theta_0\right)\right]^{-1}\right)
\end{align*}
\item Optimization. How do we obtain $\widehat{\theta}_{MLE}$ anyway?
\end{enumerate}
\end{frame}
\begin{frame}{MLE: Example \# 1}
\begin{itemize}
\item $Z_i \sim N(\theta_0,1)$ and $\Theta = (-\infty,\infty)$. In this case:
\begin{align*}
\ell(\theta) = -\frac{N}{2}\ln(2\pi) - \frac{1}{2}\sum_{i=1}^N \left(z_i - \theta\right)^2
\end{align*}
\item MLE is $\widehat{\theta}_{MLE}=\overline{z}$ which is consistent for $\theta_0 = E[Z_i]$
\item Asymptotic distribution is $\sqrt{N} ( \overline{z}-\theta_0) \stackrel{d}{\longrightarrow} N(0,1)$.
\item Calculating the mean is easy!
\end{itemize}
\end{frame}
\begin{frame}{MLE: Example \# 2}
\begin{itemize}
\item $Z_i = (Y_i, X_i)$--- $X_i$ has finite mean and variance (but arbitrary distribution)
\item $(Y_i | X_i =x) \sim N(x' \beta_0, \sigma_0^2)$
\begin{align*}
\widehat{\beta}_{MLE} &= (X'X)^{-1} X'Y\\
\widehat{\sigma}^2_{MLE} &= \frac{1}{N} \sum_{i=1}^N (y_i - x_i' \widehat{\beta}_{MLE})^2
\end{align*}
\item We have already shown consistency and asymptotic normality for linear regression with normally distributed errors...
\end{itemize}
\end{frame}
\begin{frame}{MLE: Example \# 3}
\begin{itemize}
\item $Z_i = (Y_i, X_i)$--- $X_i$ has finite mean and variance (but arbitrary distribution)
\item $Pr(Y_i=1 | X_i =x) = \frac{e^{x' \theta_0}}{1+ e^{x'\theta_0}}$
\item This is the \alert{logit} model.
\item There is no closed-form MLE solution, and establishing its properties is less obvious...
\end{itemize}
\end{frame}
\begin{frame}{Jensen's Inequality}
Let $g(z)$ be a convex function. Then $\mathbb { E }[g(Z)] \geq g(\mathbb { E }[Z])$, with equality only when $g$ is linear (or $Z$ is degenerate).
\end{frame}
\begin{frame}{More Technical Details}
Define $Y$ as the ratio of the density at $\theta$ to the density at the true value $\theta_0$ both evaluated at $Z$
\begin{align*}
Y = \frac{f_Z(Z;\theta)}{f_Z(Z;\theta_0)}
\end{align*}
\begin{itemize}
\item Let $g(a) = -\ln(a)$ so that $g'(a) = \frac{-1}{a}$ and $g''(a) =\frac{1}{a^2}$.
\item Then by \alert{Jensen's Inequality} $\mathbb{E}[- \ln Y] \geq - \ln \mathbb{E}[Y]$.
\item This gives us
\begin{align*}
\mathbb { E }_z \left[ - \ln \left( \frac { f _ { Z } ( Z ; \theta ) } { f _ { Z } \left( Z ; \theta _ { 0 } \right) } \right) \right] \geq - \ln \left( \mathbb { E }_z \left[ \frac { f _ { Z } ( Z ; \theta ) } { f _ { Z } \left( Z ; \theta _ { 0 } \right) } \right] \right)
\end{align*}
\item The RHS is
\begin{align*}
\mathbb { E }_z \left[ \frac { f _ { Z } ( Z ; \theta ) } { f _ { Z } \left( Z ; \theta _ { 0 } \right) } \right] = \int \frac { f _ { Z } ( z ; \theta ) } { f _ { Z } \left( z ; \theta _ { 0 } \right) } \cdot f _ { Z } \left( z ; \theta _ { 0 } \right) d z = \int f _ { Z } ( z ; \theta ) d z = 1
\end{align*}
\end{itemize}
\end{frame}
\begin{frame}{More Technical Details}
Because $\log(1)=0$ this implies:
\begin{align*}
\mathbb { E }_z \left[ - \ln \left( \frac { f _ { Z } ( Z ; \theta ) } { f _ { Z } \left( Z ; \theta _ { 0 } \right) } \right) \right] \geq 0
\end{align*}
Therefore
\begin{align*}
- \mathbb { E } \left[ \ln f _ { Z } ( Z ; \theta ) \right] &+ \mathbb { E } \left[ \ln f _ { Z } \left( Z ; \theta _ { 0 } \right) \right] \geq 0\\
\mathbb { E } \left[ \ln f _ { Z } \left( Z ; \theta _ { 0 } \right) \right] &\geq \mathbb { E } \left[ \ln f _ { Z } ( Z ; \theta ) \right]
\end{align*}
\begin{itemize}
\item We maximize the expected value of the log likelihood at the true value of $\theta$!
\item Helpful to work with $\mathbb{E}[\log f(z; \theta)]$ sometimes.
\end{itemize}
\end{frame}
\begin{frame}{Information Matrix Equality}
We can relate the \alert{Fisher Information} to the Hessian of the log-likelihood
\begin{align*}
\mathcal{I}\left(\theta_0\right) = -\mathbb{E}\left[\frac{\partial^2 \ln f}{\partial \theta \partial \theta'}\left(z; \theta_0\right)\right]
= \mathbb { E } \left[ \frac { \partial \ln f } { \partial \theta } \left( z ; \theta _ { 0 } \right) \times \frac { \partial \ln f } { \partial \theta } \left( z ; \theta _ { 0 } \right)' \right]
\end{align*}
\begin{itemize}
\item This is sometimes known as the \alert{outer product of scores}.
\item The expected Hessian is \alert{negative definite}, so $\mathcal{I}(\theta_0)$ is \alert{positive definite}
\item Recall that $\mathbb{E}\left[\frac{\partial \ln f}{\partial \theta}\left(z; \theta_0\right)\right] = 0$: the expected score is zero at the true $\theta_0$
\end{itemize}
\end{frame}
\begin{frame}{Proof}
\begin{align*}
1 = \int _ { z } f _ { Z } ( z ; \theta ) d z \Rightarrow 0 = \frac { \partial } { \partial \theta } \int _ { z } f _ { Z } ( z ; \theta ) d z
\end{align*}
With some regularity conditions
\begin{align*}
0 = \int _ { z } \frac { \partial f _ { Z } } { \partial \theta } ( z ; \theta ) d z = \underbrace{\int _ { z } \frac { \partial \ln f _ { Z } } { \partial \theta } ( z ; \theta ) \cdot f _ { Z } ( z ; \theta ) d z}_{\mathbb { E } \left[ \frac { \partial \ln f _ { Z } } { \partial \theta } \left( z ; \theta _ { 0 } \right) \right]}
\end{align*}
\begin{itemize}
\item This gives us the FOC we needed.
\item Can get information identity with another set of derivatives.
\end{itemize}
\end{frame}
\begin{frame}{The Cramer-Rao Bound}
We can relate the \alert{Fisher Information} to the Hessian of the log-likelihood
\begin{align*}
\mathcal { I } ( \theta ) = - \mathbb { E } \left[ \frac { \partial ^ { 2 } \ln f } { \partial \theta \partial \theta ^ { \prime } } ( Z | \theta ) \right]
\end{align*}
It turns out this bounds the variance of any unbiased estimator $\hat{\theta}(Z)$
\begin{align*}
\operatorname{Var}(\hat{\theta}(Z)) \geq \mathcal{I}\left(\theta_0\right)^{-1}
\end{align*}
Because no unbiased estimator can beat the inverse Fisher Information, the MLE is (asymptotically) the most efficient estimator!
\end{frame}
\begin{frame}{MLE: Discussion}
Tradeoffs
\begin{itemize}
\item How does this compare to GM Theorem?
\item If MLE is most efficient estimate, why ever use something else?
\end{itemize}
\end{frame}
\begin{frame}{Exponential Example}
\begin{align*}
f _ { Y | X } ( y | x , \beta _ { 0 } ) = { e } ^ { x ^ { \prime } \beta _ { 0 } } \exp \left( - y { e } ^ { x ^ { \prime } \beta _ { 0 } } \right)
\end{align*}
With log likelihood
\begin{align*}
\ell(\beta) = \sum_{i=1}^N \ln f_{Y|X}\left(y_i | x_i, \beta\right) = \sum_{i=1}^N \left[ x_i'\beta - y_i \cdot \exp\left(x_i'\beta\right) \right]
\end{align*}
And Score, Hessian, and Information Matrix:
\begin{align*}
\mathcal{S}_i ( y_i, x_i , \beta ) &= x_i \left( 1 - y_i \exp \left( x_i'\beta \right) \right)\\
\mathcal { H }_i ( y_i , x_i , \beta ) &= - y_i x_i x_i ^ { \prime } \exp \left( x_i ^ { \prime } \beta \right)\\
\mathcal{I}\left(\beta_0\right) &= \mathbb{E}\left[ Y X X' \exp\left(X'\beta_0\right)\right] = \mathbb{E}\left[ X X'\right]
\end{align*}
The last equality uses $\mathbb{E}[Y|X] = e^{-X'\beta_0}$.
\end{frame}
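\begin{frame}[fragile]{Exponential Example in Code}
\footnotesize
A sketch of the exponential-regression MLE in R on simulated data (all object names and the true $\beta_0$ are illustrative assumptions, not the lecture's data):
\begin{semiverbatim}
set.seed(2)
n <- 5000
x <- cbind(1, rnorm(n))                        # design matrix with an intercept
beta0 <- c(0.5, -1)
y <- rexp(n, rate = exp(x %*% beta0))          # Y | X ~ Exponential(exp(x'beta0))
nll <- function(b) -sum(x %*% b - y * exp(x %*% b))   # minus the log likelihood above
fit <- optim(c(0, 0), nll, method = "BFGS")
fit$par                                        # close to (0.5, -1)
\end{semiverbatim}
\end{frame}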
\section*{Computing Maximum Likelihood Estimators}
\begin{frame}{Newton's Method for Root Finding}
Consider the Taylor expansion of $f(x)$ around $x_0$:
\begin{align*}
f(x) \approx f(x_0) + f'(x_0) \cdot (x-x_0) + \frac{1}{2} f''(x_0) \cdot (x-x_0)^2 + O\left((x-x_0)^3\right)
\end{align*}
Suppose we want to find a \alert{root} $x^{*}$ where $f(x^{*})=0$. Drop the second-order term and solve for $x$:
\begin{align*}
0 &= f(x_0) + f'(x_0) \cdot (x-x_0) \\
x_1 &= x_0-\frac{f(x_0)}{f'(x_0)}
\end{align*}
This gives us an \alert{iterative} scheme to find $x^{*}$:
\begin{enumerate}
\item Start with some $x_k$. Calculate $f(x_k),f'(x_k)$
\item Update using $x_{k+1} = x_k - \frac{f(x_k)}{f'(x_k)} $
\item Stop when $|x_{k+1}-x_{k}| < \epsilon_{tol}$.
\end{enumerate}
\end{frame}
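\begin{frame}[fragile]{Newton's Method in Code}
\footnotesize
A bare-bones version of the iteration in R (toy function and starting value are arbitrary; a fixed number of iterations stands in for the $\epsilon_{tol}$ stopping rule):
\begin{semiverbatim}
f  <- function(x) x^3 - 2*x - 5        # we want a root f(x*) = 0
df <- function(x) 3*x^2 - 2
x <- 2                                 # starting guess x_0
for (k in 1:25) x <- x - f(x)/df(x)    # update x_k+1 = x_k - f(x_k)/f'(x_k)
c(x, f(x))                             # x approx 2.0946, f(x) approx 0
\end{semiverbatim}
\end{frame}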
\begin{frame}{Newton-Raphson for Minimization}
We can re-write \alert{optimization} as \alert{root finding};
\begin{itemize}
\item We want to know $\hat{\theta} = \arg \max_{\theta} \ell(\theta)$.
\item Construct the FOCs $\frac{\partial \ell}{\partial \theta}=0 \rightarrow$ and find the zeros.
\item How? using Newton's method! Set $f(\theta) = \frac{\partial \ell}{\partial \theta}$
\end{itemize}
\begin{align*}
\theta_{k+1} &= \theta_k - \left[ \frac{\partial^2 \ell}{\partial \theta^2}(\theta_k) \right]^{-1} \cdot \frac{\partial \ell}{\partial \theta}(\theta_k)
\end{align*}
The SOC for a maximum is that $ \frac{\partial^2 \ell}{\partial \theta^2} < 0$, ideally at all $\theta_k$.\\
This is all for a \alert{single variable} but the \alert{multivariate} version is basically the same.
\end{frame}
\begin{frame}{Newton's Method: Multivariate}
Start with the objective $Q(\theta) = - \ell(\theta)$:
\begin{itemize}
\item Approximate $Q(\theta)$ around some initial guess $\theta_0$ with a quadratic function
\item Minimize the quadratic function (because that is easy) and call the minimizer $\theta_1$
\item Update the approximation and repeat.
\begin{align*}
\theta_{k+1} = \theta_k - \left[ \frac{\partial^2 Q}{\partial \theta \partial \theta'} \right]^{-1}\frac{\partial Q}{\partial \theta}(\theta_k)
\end{align*}
\item The equivalent SOC is that the {Hessian Matrix} is \alert{positive semi-definite} (ideally at all $\theta$).
\item In that case the problem is \alert{globally convex} and has a \alert{unique minimum} that is easy to find.
\end{itemize}
\end{frame}
\begin{frame}{Newton's Method}
We can generalize to Quasi-Newton methods:
\begin{align*}
\theta_{k+1} = \theta_k - \lambda_k \underbrace{\left[ \frac{\partial^2 Q}{\partial \theta \partial \theta'} \right]^{-1}}_{A_k} \frac{\partial Q}{\partial \theta}(\theta_k)
\end{align*}
Two Choices:
\begin{itemize}
\item Step length $\lambda_k$
\item Step direction $d_k=A_k \frac{\partial Q}{\partial \theta}(\theta_k)$
\item Often rescale the direction to be unit length $\frac{d_k}{\norm{d_k}}$.
\item If we use $A_k$ as the true Hessian and $\lambda_k=1$ this is a \alert{full Newton step}.
\end{itemize}
\end{frame}
\begin{frame}{Newton's Method: Alternatives}
Choices for $A_k$
\begin{itemize}
\item $A_k= I_{k}$ (Identity) is known as \alert{gradient descent} or \alert{steepest descent}
\item BHHH. Specific to MLE. Exploits the \alert{Fisher Information}.
\begin{align*}
A _ { k }
&= \left[ \frac { 1 } { N } \sum _ { i = 1 } ^ { N } \frac { \partial \ln f } { \partial \theta } \left( \theta _ { k } \right) \frac { \partial \ln f } { \partial \theta ^ { \prime } } \left( \theta _ { k } \right) \right] ^ { - 1 }\\
&=- \mathbb { E } \left[ \frac { \partial ^ { 2 } \ln f } { \partial \theta \partial \theta ^ { \prime } } \left( Z , \theta ^ { * } \right) \right]
= \mathbb { E } \left[ \frac { \partial \ln f } { \partial \theta } \left( Z , \theta ^ { * } \right) \frac { \partial \ln f } { \partial \theta ^ { \prime } } \left( Z , \theta ^ { * } \right) \right]
\end{align*}
\item Alternatives such as \alert{SR1} and \alert{DFP} rely on an initial estimate of the Hessian matrix and then approximate updates to $A_k$.
\item Usually updating the Hessian is the costly step.
\item Non-invertible Hessians are bad news.
\end{itemize}
\end{frame}
\section{Extended Example: Binary Choice}
\begin{frame}
\frametitle{Binary Choice: Overview}
Many problems we are interested in look at discrete rather than continuous outcomes:
\begin{itemize}
\item Entering a Market/Opening a Store
\item Working or not
\item Being married or not
\item Exporting to another country or not
\item Going to college or not
\item Smoking or not
\item etc.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Simplest Example: Flipping a Coin}
Suppose we flip a coin which yields heads ($Y=1$) or tails ($Y=0$). We want to estimate the probability $p$ of heads:
\begin{eqnarray*}
Y_i =
\begin{cases}
1 \mbox{ with probability } p \\
0 \mbox{ with probability } 1-p
\end{cases}
\end{eqnarray*}
We see some data $Y_1,\ldots,Y_N$ which are i.i.d.\\
\vspace{0.2cm}
We know that $Y_i \sim Bernoulli(p)$.
\end{frame}
\begin{frame}{Simplest Example: Flipping a Coin}
We can write the likelihood of $N$ Bernoulli trials as
$$Pr(Y_1 = y_1, Y_2=y_2,\ldots,Y _N=y_N ) = f(y_1,y_2,\ldots,y_N | p ) $$
\begin{eqnarray*}
&=& \prod_{i=1}^N p^{y_i} (1-p)^{1-y_i}\\
&=& p^{\sum_{i=1}^N y_i} (1-p)^{N-\sum_{i=1}^N y_i}
\end{eqnarray*}
And then take logs to get the \alert{log likelihood}:
\begin{eqnarray*}
\ln f(y_1,y_2,\ldots,y_N | p ) &=& \left( \sum_{i=1}^N y_i \right) \ln p + \left(N-\sum_{i=1}^N y_i \right) \ln(1-p)
\end{eqnarray*}
\end{frame}
\begin{frame}{Simplest Example: Flipping a Coin}
Differentiate the log-likelihood to find the maximum:
\begin{eqnarray*}
\ln f(y_1,y_2,\ldots,y_N | p ) &=& \left( \sum_{i=1}^N y_i \right) \ln p + \left(N-\sum_{i=1}^N y_i \right) \ln(1-p)\\
\rightarrow 0&=& \frac{1}{\hat{p}} \left( \sum_{i=1}^N y_i \right) + \frac{-1}{1-\hat{p}} \left(N-\sum_{i=1}^N y_i \right) \\
\frac{\hat{p}}{1-\hat{p}} &=& \frac{\sum_{i=1}^N y_i }{N- \sum_{i=1}^N y_i } = \frac{\overline{Y}}{1-\overline{Y}} \\
\hat{p}^{MLE} &=& \overline{Y}
\end{eqnarray*}
That was a lot of work to get the obvious answer: \alert{fraction of heads}.
\end{frame}
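\begin{frame}[fragile]{Coin Flipping in Code}
\footnotesize
A quick numerical check in R that the Bernoulli MLE is just the sample mean (simulated flips; a one-parameter problem can use \texttt{optimize} directly):
\begin{semiverbatim}
set.seed(3)
y <- rbinom(500, size = 1, prob = 0.3)               # 500 flips with true p = 0.3
ll <- function(p) sum(y)*log(p) + (length(y) - sum(y))*log(1 - p)
optimize(ll, interval = c(0.001, 0.999), maximum = TRUE)$maximum
mean(y)                                              # same number (up to tolerance)
\end{semiverbatim}
\end{frame}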
\begin{frame}{More Complicated Example: Adding Covariates}
We probably are interested in more complicated cases where $p$ is not the same for all observations but rather $p(X)$ depends on some covariates. Here is an example from the Boston HMDA Dataset:
\begin{itemize}
\item 2380 observations from 1990 in the greater Boston area.
\item Data on: individual Characteristics, Property Characteristics, Loan Denial/Acceptance (1/0).
\item Mortgage Application process circa 1990-1991:
\begin{itemize}
\item Go to bank
\item Fill out an application (personal+financial info)
\item Meet with loan officer
\item Loan officer makes decision
\begin{itemize}
\item Legally in a race-blind way (discrimination is illegal but rampant)
\item Wants to maximize profits (i.e., loan to people who don't end up defaulting!)
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Loan Officer's Decision}
Financial Variables:
\begin{itemize}
\item $P/I$ ratio
\item housing expense to income ratio
\item loan-to-value ratio
\item personal credit history (FICO score, etc.)
\item Probably some nonlinearity:
\begin{itemize}
\item Very high $LTV > 80\%$ or $>95\%$ is a bad sign (strategic defaults?)
\item Credit Score Thresholds
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Loan Officer's Decision}
Goal $Pr(Deny=1 | black, X)$\\
\begin{itemize}
\item Lots of potential \alert{omitted variables} which are correlated with race
\begin{itemize}
\item Wealth, type of employment
\item family status
\item credit history
\item zip code of property
\end{itemize}
\item Lots of \alert{redlining} cases hinge on whether or not black applicants were treated in a discriminatory way.
\end{itemize}
\end{frame}
\begin{frame}
\begin{center}
\includegraphics[width=3.25in]{resources/hmda1.pdf}\\
\includegraphics[width=3.25in]{resources/hmda2.pdf}
\end{center}
\end{frame}
\begin{frame}{Linear Probability Model}
First thing we might try is OLS
\begin{eqnarray*}
Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i
\end{eqnarray*}
\begin{itemize}
\item What does $\beta_1$ mean when $Y$ is binary? Is $\beta_1 = \frac{\Delta Y}{\Delta X}$?
\item What does the line $\beta_0 + \beta_1 X$ mean when $Y$ is binary?
\item What does the predicted value $\hat{Y}$ mean when $Y$ is binary? Does $\hat{Y} = 0.26$ mean that someone gets approved or denied for a loan?
\end{itemize}
\end{frame}
\begin{frame}{Linear Probability Model}
OLS is called the \alert{linear probability model}
\begin{eqnarray*}
Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i
\end{eqnarray*}
because:
\begin{eqnarray*}
E[Y | X] &=& 1 \times Pr(Y=1 | X) + 0 \times Pr(Y=0 | X) \\
Pr(Y=1 | X) &=& \beta_0 + \beta_1 X_i
\end{eqnarray*}
The predicted value is a \alert{probability} and
\begin{eqnarray*}
\beta_1 = \frac{Pr(Y=1 | X =x+\Delta x) - Pr(Y=1 | X=x)}{\Delta x}
\end{eqnarray*}
So $\beta_1$ represents the average change in probability that $Y=1$ for a unit change in $X$.
\end{frame}
\begin{frame}
\begin{center}
\includegraphics[width=5in]{resources/lpm.pdf}\\
\end{center}
\end{frame}
\begin{frame}{That didn't look great}
\begin{itemize}
\item Is the marginal effect $\beta_1$ actually constant or does it depend on $X$?
\item Sometimes we predict $\hat{Y} >1$ or $\hat{Y} <0$. What does that even mean? Is it still a probability?
\item Fit in the middle seems not so great -- what does $\hat{Y} = 0.5$ mean?
\end{itemize}
\end{frame}
\begin{frame}{Results}
\begin{eqnarray*}
\widehat{deny_i}= -.091& + .559 \cdot \text{P/I ratio} + &.177 \cdot \text{black} \\
(0.32)&( .098) & (.025)\\
\end{eqnarray*}
Marginal Effects:
\begin{itemize}
\item Increasing $P/I$ from $0.3 \rightarrow 0.4$ increases the probability of denial by $5.59$ percentage points (true at all levels of $P/I$).
\item At all $P/I$ levels blacks are $17.7$ percentage points more likely to be denied.
\item But still some omitted factors.
\item True effects are likely to be \alert{nonlinear}: can we add polynomials in $P/I$? Dummies for different levels?
\end{itemize}
\end{frame}
\begin{frame}{Moving Away from LPM}
The problem with the LPM/OLS is that it requires \alert{constant marginal effects}, i.e., that the probability is a linear function of the parameters:
\begin{eqnarray*}
Pr(Y=1 | X) = \beta_0 + \beta_1 X
\end{eqnarray*}
Some desirable properties:
\begin{itemize}
\item Can we restrict our predictions to $[0,1]$?
\item Can we preserve \alert{monotonicity} so that $Pr(Y=1| X)$ is increasing in $X$ for $\beta_1 >0$?
\item Some other properties (continuity, etc.) \pause
\item Want a function $F(z): (-\infty,\infty) \rightarrow [0,1]$.
\item What function will work?
\end{itemize}
\end{frame}
\begin{frame}
\begin{center}
\includegraphics[width=5in]{resources/probit.pdf}
\end{center}
\end{frame}
\begin{frame}{Choosing a transformation}
\begin{eqnarray*}
Pr(Y=1 | X) = F(\beta_0 + \beta_1 X)
\end{eqnarray*}
\begin{itemize}
\item One $F(\cdot)$ that works is $\Phi(z)$ the normal CDF. This is the \alert{probit} model.
\begin{itemize}
\item Actually any CDF would work but the normal is convenient.
\end{itemize}
\item One $F(\cdot)$ that works is $\frac{e^z}{1+ e^z}=\frac{1}{1+e^{-z}}$, the logistic function. This is the \alert{logit} model.
\item Both of these give `S'-shaped curves.
\item The LPM takes $F(\cdot)$ to be the \alert{identity function} (which doesn't satisfy the $[0,1]$ property).
\item This $F(\cdot)$ is often called a \alert{link function}. Why?
\end{itemize}
\end{frame}
\begin{frame}{Why use the normal CDF?}
Has some nice properties:
\begin{itemize}
\item Gives us more of the `S' shape
\item $Pr(Y=1|X)$ is increasing in $X$ if $\beta_1>0$.
\item $Pr(Y=1|X) \in [0,1]$ for all $X$
\item Easy to use -- you can look up or use computer for normal CDF.
\item Relatively straightforward interpretation
\begin{itemize}
\item $Z=\beta_0 + \beta_1 X$ is the $z$-value.
\item $\beta_1$ is the change in the $z$-value for a change in $X_1$.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\begin{center}
\includegraphics[width=2.5in]{resources/hmda3.pdf}\\
\includegraphics[width=2.5in]{resources/hmda3a.pdf}\\
\end{center}
\end{frame}
\begin{frame}[fragile]{Probit in R}
\footnotesize
\begin{semiverbatim}
bm1 <- glm(deny ~ pi_rat+black, data=hmda, family = binomial(link="probit"))
coeftest(bm1)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.258787 0.136691 -16.5248 < 2.2e-16 ***
pi_rat 2.741779 0.380469 7.2063 5.749e-13 ***
blackTRUE 0.708155 0.083352 8.4959 < 2.2e-16 ***
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
predict(bm1, data.frame(pi_rat=.3,black=FALSE),type = "response")
0.07546516
predict(bm1, data.frame(pi_rat=.3,black=TRUE),type = "response")
0.2332769
\end{semiverbatim}
\begin{itemize}
\item Probit predicts a 7.5\% chance of mortgage denial for non-black applicants, and 23.3\% chance for black ones.
\end{itemize}
\end{frame}
\begin{frame}{Why use the logistic CDF?}
Has some nice properties:
\begin{itemize}
\item Gives us more of the `S' shape
\item $Pr(Y=1|X)$ is increasing in $X$ if $\beta_1>0$.
\item $Pr(Y=1|X) \in [0,1]$ for all $X$
\item Easy to compute: $\frac{1}{1+e^{-z}}=\frac{e^{z}}{1+e^{z}}$ has analytic derivatives too.
\item Log odds interpretation
\begin{itemize}
\item $\log(\frac{p}{1-p}) = \beta_0 + \beta_1 X$
\item $\beta_1$ tells us how \alert{log odds ratio} responds to $X$.
\item $\log\left(\frac{p}{1-p}\right) \in (-\infty,\infty)$, which fixes the $[0,1]$ problem in the other direction.
\item more common in other fields (epidemiology, biostats, etc.).
\end{itemize}
\item Also has the property that $F(z) = 1-F(-z)$.
\item Similar to probit but different scale of coefficients
\item Logit/Logistic are sometimes used interchangeably but sometimes mean different things depending on the literature.
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Logit in R}
\footnotesize
\begin{semiverbatim}
bm1 <-glm(deny~pi_rat+black,data=hmda, family=binomial(link="logit"))
coeftest(bm1)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.12556 0.26841 -15.3701 < 2.2e-16 ***
pi_rat 5.37036 0.72831 7.3737 1.66e-13 ***
blackTRUE 1.27278 0.14620 8.7059 < 2.2e-16 ***
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
> predict(bm1, data.frame(pi_rat=.3,black=TRUE),type = "response")
0.2241459
> predict(bm1, data.frame(pi_rat=.3,black=FALSE),type = "response")
0.07485143
\end{semiverbatim}
\begin{itemize}
\item Logit predicts a 7.5\% chance of mortgage denial for non-black applicants, and 22.4\% chance for black ones. (Very similar to probit).
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{A quick comparison}
\begin{itemize}
\item LPM prediction departs greatly from CDF long before $[0,1]$ limits.
\item We get probabilities that are too extreme even for $X\hat{\beta}$ ``in bounds''.
\item Some (MHE) argue that though $\hat{Y}$ is flawed, constant marginal effects are still OK.
\item Logit and Probit are highly similar
\end{itemize}
\begin{center}
\includegraphics[width=2in]{resources/lpm-probit.jpg}
\end{center}
\end{frame}
\begin{frame}
\begin{center}
\includegraphics[width=2.5in]{resources/hmda3.pdf}\\
\includegraphics[width=2.5in]{resources/hmda3a.pdf}\\
\end{center}
\end{frame}
\begin{frame}
\frametitle{Latent Variables/ Limited Dependent Variables}
An alternative way to think about this problem is that there is a continuously distributed $Y^{*}$ that we as the econometrician don't observe.
\begin{eqnarray*}
Y_i =
\begin{cases}
1 \mbox{ if } Y^{*} >0 \\
0 \mbox{ if } Y^{*} \leq 0
\end{cases}
\end{eqnarray*}
\begin{itemize}
\item Instead we only see whether $Y^{*}$ exceeds some threshold (in this case $0$).
\item We can think about $Y^{*}$ as a \alert{latent variable}.
\item Sometimes you will see this description in the literature; everything else is the same!
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Index Models}
We sometimes call these single index models or threshold crossing models
\begin{eqnarray*}
Z_i = X_i \beta
\end{eqnarray*}
\begin{itemize}
\item We start with a potentially large number of regressors in $X_i$ but $X_i \beta = Z_i$ is a \alert{scalar}
\item We can just calculate $F(Z_i)$ for Logit or Probit (or some other CDF).
\item $Z_i$ is the \alert{index}. If $Z_i = X_i \beta$ we say it is a \alert{linear index} model.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{What does software do?}
\begin{itemize}
\item One temptation might be \alert{nonlinear least squares}:
\begin{eqnarray*}
\hat{\beta}^{NLLS} = \arg \min_{\beta} \sum_{i=1}^N (Y_i - \Phi(X_i \beta))^2
\end{eqnarray*}
\item Turns out this isn't what people do.
\item We can't always directly estimate using the log-odds
\begin{eqnarray*}
\log\left(\frac{p}{1-p}\right)= \beta X_i + \varepsilon_i
\end{eqnarray*}
\item The problem is that $p$ or $p(X_i)$ isn't really observed.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{What does software do?}
\begin{itemize}
\item Can construct an MLE:
\begin{eqnarray*}
\hat{\beta}^{MLE} &=& \arg \max_{\beta} \prod_{i=1}^N F(Z_i)^{y_i} (1-F(Z_i))^{1-{y_i} }\\
Z_i &=& \beta_0 + \beta_1 X_i
\end{eqnarray*}
\item Probit: $F(Z_i) = \Phi(Z_i)$ and its derivative (density) $f(Z_i) = \phi(Z_i)$. \\
Also is \alert{symmetric} so that $1 - F(Z_i) = F(-Z_i)$.
\item Logit: $F(Z_i) = \frac{1}{1+e^{-Z_i}}$ and its derivative (density) $f(Z_i) = \frac{e^{-Z_i}}{(1+e^{-Z_i})^2}$. A more convenient property is that $\frac{f(z)}{F(z)} = 1 - F(z)$; this is called the \alert{hazard rate}.
\end{itemize}
\end{frame}
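\begin{frame}[fragile]{Hand-Rolling the Binary Choice MLE}
\footnotesize
A sketch of what the software does for the logit case, on simulated data (the names \texttt{x}, \texttt{y}, \texttt{nll} and the true $\beta$ are illustrative; compare the hand-rolled fit to \texttt{glm}):
\begin{semiverbatim}
set.seed(4)
n <- 2000; x <- cbind(1, rnorm(n))
y <- rbinom(n, 1, plogis(x %*% c(-1, 2)))            # true beta = (-1, 2)
nll <- function(b) -sum(y*plogis(x %*% b, log.p = TRUE) +
                        (1 - y)*plogis(-(x %*% b), log.p = TRUE))
optim(c(0, 0), nll, method = "BFGS")$par             # maximize sum y ln F + (1-y) ln(1-F)
coef(glm(y ~ x[, 2], family = binomial(link = "logit")))  # essentially the same answer
\end{semiverbatim}
\end{frame}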
\begin{frame}
\frametitle{A probit trick}
Let $q_i = 2 y_i -1$
\begin{eqnarray*}
F(q_i \cdot Z_i) =
\begin{cases}
F(Z_i) &\mbox{ when } y_i=1 \\
F(-Z_i) = 1-F(Z_i)& \mbox{ when } y_i=0
\end{cases}
\end{eqnarray*}
So that
\begin{eqnarray*}
\ell(y_1,\ldots, y_n | \beta) = \sum_{i=1}^N \ln F(q_i \cdot Z_i)
\end{eqnarray*}
\end{frame}
\begin{frame}
\frametitle{FOC of Log-Likelihood}
\begin{eqnarray*}
\ell(y_1,\ldots,y_n | \beta) &=& \sum_{i=1}^N y_i \ln F(Z_i) + (1-y_i) \ln(1- F(Z_i)) \\
\frac{\partial l }{\partial \beta} &=& \sum_{i=1}^N \frac{y_i}{ F(Z_i)} \frac{ d F}{d \beta}(Z_i) - \frac{1-y_i}{1-F(Z_i)} \frac{ d F}{d \beta} (Z_i)\\
&=& \sum_{i=1}^N \frac{y_i \cdot f(Z_i) }{ F(Z_i)} \frac{ d Z_i}{d \beta} - \sum_{i=1}^N \frac{(1-y_i)\cdot f(Z_i) }{1-F(Z_i)} \frac{ d Z_i}{d \beta} \\
&=& \sum_{i=1}^N \left[ \frac{y_i \cdot f(Z_i) }{ F(Z_i)} X_i - \frac{(1-y_i)\cdot f(Z_i) }{1-F(Z_i)} X_i \right] \\
\end{eqnarray*}
\end{frame}
\begin{frame}
\frametitle{FOC of Log-Likelihood (Logit)}
This is the \alert{score} of the log-likelihood:
\begin{eqnarray*}
\frac{\partial l }{\partial \beta} = \nabla_{\beta} \cdot \ell(\mathbf{y}; \beta) &=& \sum_{i=1}^N \left[ y_i \frac{ f(Z_i) }{ F(Z_i)} - (1-y_i) \frac{f(Z_i) }{1-F(Z_i)} \right] \cdot X_i
\end{eqnarray*}
It is technically also a \alert{moment condition}. It is easy for the logit
\begin{eqnarray*}
\nabla_{\beta} \cdot \ell(\mathbf{y}; \beta) &=& \sum_{i=1}^N \left[ y_i (1-F(Z_i)) - (1-y_i) F(Z_i) \right] \cdot X_i \\
&=& \sum_{i=1}^N \underbrace{\left[ y_i - F(Z_i) \right]}_{\varepsilon_i} \cdot X_i
\end{eqnarray*}
This comes from the hazard rate.
\end{frame}
\begin{frame}
\frametitle{FOC of Log-Likelihood (Probit)}
This is the \alert{score} of the log-likelihood:
\begin{eqnarray*}
\frac{\partial l }{\partial \beta} = \nabla_{\beta} \cdot \ell(\mathbf{y}; \beta) &=& \sum_{i=1}^N \left[ y_i \frac{ f(Z_i) }{ F(Z_i)} - (1-y_i) \frac{f(Z_i) }{1-F(Z_i)} \right] \cdot X_i \\
&=& \sum_{y_i=1} \frac{\phi(Z_i) }{ \Phi(Z_i)} X_i +\sum_{y_i=0} \frac{-\phi(Z_i) }{1-\Phi(Z_i)} X_i
\end{eqnarray*}
Using the $q_i = 2 y_i -1$ trick
\begin{eqnarray*}
\nabla_{\beta} \cdot \ell(\mathbf{y}; \beta)= \sum_{i=1}^N \underbrace{\frac{ q_i \phi(q_i Z_i)}{\Phi(q_i Z_i)}}_{\lambda_i} X_i
\end{eqnarray*}
\end{frame}
\begin{frame}
\frametitle{The Hessian Matrix}
We could also take second derivatives to get the \alert{Hessian} matrix:
\begin{eqnarray*}
\frac{\partial^2 \ell }{\partial \beta \partial \beta'} = - \sum_{i=1}^N y_i \frac{ f(Z_i)^2 - f'(Z_i) F(Z_i) }{ F(Z_i)^2} X_i X_i' \\
- \sum_{i=1}^N (1-y_i) \frac{f(Z_i)^2 + f'(Z_i)(1-F(Z_i))}{(1-F(Z_i))^2} X_i X_i'
\end{eqnarray*}
This is a $K\times K$ matrix where $K$ is the dimension of $X$ or $\beta$.
\end{frame}
\begin{frame}
\frametitle{The Hessian Matrix (Logit)}
For the logit this is even easier (use the simplified logit score):
\begin{eqnarray*}
\frac{\partial l^2 }{\partial \beta \partial \beta'} &=& - \sum_{i=1}^N f(Z_i) X_i X_i' \\
&=& - \sum_{i=1}^N F(Z_i) (1- F(Z_i)) X_i X_i'
\end{eqnarray*}
This is \alert{negative semi-definite}.
\end{frame}
\begin{frame}
\frametitle{The Hessian Matrix (Probit)}
Recall
\begin{eqnarray*}
\nabla_{\beta} \cdot \ell(\mathbf{y}; \beta)= \sum_{i=1}^N \underbrace{\frac{ q_i \phi(q_i Z_i)}{\Phi(q_i Z_i)}}_{\lambda_i} X_i
\end{eqnarray*}
Take another derivative and recall $\phi'(z) = - z \phi(z)$
\begin{eqnarray*}
\nabla_{\beta}^2 \cdot \ell(\mathbf{y}; \beta)&=& \sum_{i=1}^N \frac{q_i^2\left[\phi'(q_i Z_i) \Phi(q_i Z_i) - \phi(q_i Z_i)^2\right]}{\Phi(q_i Z_i)^2} X_i X_i' \\
&=& - \sum_{i=1}^N \lambda_i( Z_i + \lambda_i) \cdot X_i X_i'
\end{eqnarray*}
Hard to show but this is \alert{negative definite} too.
\end{frame}
\begin{frame}
\frametitle{Estimation}
\begin{itemize}
\item We can try to find the values of $\beta$ which make the average score $=0$ (the FOC).
\item But no closed form solution!
\item Recall the Taylor expansion:
\end{itemize}
\begin{eqnarray*}
f(x_0 + \Delta x) \approx f(x_0) + f'(x_0) \Delta x + \frac{1}{2} f''(x_0) (\Delta x)^2
\end{eqnarray*}
The goal is to find where $f'(x) \approx 0$, so take the derivative w.r.t.\ $\Delta x$:
\begin{eqnarray*}
\frac{d}{d \Delta x} \left[ f(x_0) + f'(x_0) \Delta x + \frac{1}{2} f''(x_0) (\Delta x)^2 \right] = f'(x_0) + f''(x_0) (\Delta x) = 0
\end{eqnarray*}
Solve for $\Delta x$
\begin{eqnarray*}
\Delta x = - f'(x_0) / f''(x_0)
\end{eqnarray*}
\end{frame}
\begin{frame}
\frametitle{Estimation}
\begin{itemize}
\item In multiple dimensions this becomes:
\begin{eqnarray*}
x_{n+1} = x_{n} - \alpha \cdot \left[\mathbf{H}_f (x_n) \right]^{-1} \nabla f(x_n)
\end{eqnarray*}
\item $\mathbf{H}_f (x_n)$ is the \alert{Hessian} Matrix. $ \nabla f(x_n)$ is the \alert{gradient}.
\item $\alpha \in [0,1]$ is a parameter that determines \alert{step size}
\item Idea is that we approximate the likelihood with a quadratic function and minimize that (because we know how to solve those).
\item Each step we update our quadratic approximation.
\item If problem is \alert{convex} this will always converge (and quickly)
\item Most software ``cheats'' and doesn't compute $\left[\mathbf{H}_f (x_n) \right]^{-1}$ but uses tricks to update on the fly (BFGS, Broyden, DFP, SR1). Mostly you see these options in your software.
\end{itemize}
\end{frame}
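\begin{frame}[fragile]{Newton-Raphson for the Logit, by Hand}
\footnotesize
A sketch of the full Newton iteration for the logit, using the analytic score and Hessian from the earlier slides (reuses the simulated \texttt{x}, \texttt{y} from the hand-rolled sketch; converges in a handful of steps):
\begin{semiverbatim}
b <- c(0, 0)                                   # starting values
for (k in 1:10) \{
  p     <- plogis(x %*% b)                     # F(Z_i)
  score <- t(x) %*% (y - p)                    # sum (y_i - F(Z_i)) X_i
  H     <- -t(x) %*% (x * as.vector(p*(1-p)))  # -sum F(Z_i)(1-F(Z_i)) X_i X_i'
  b     <- b - solve(H, score)                 # full Newton step (lambda = 1)
\}
b                                              # matches the glm coefficients
\end{semiverbatim}
\end{frame}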
\begin{frame}
\frametitle{Marginal effects}
\begin{eqnarray*}
\frac{\partial E[Y_i | X_i] }{\partial X_{ik}} = f (Z_i) \beta_k
\end{eqnarray*}
\begin{itemize}
\item The whole point was that we wanted marginal effects not to be constant
\item So where do we evaluate?
\begin{itemize}
\item Software often plugs in mean or median values for each component
\item Alternatively we can integrate over $X$ and compute:
$$
E_{X_i}[ f (Z_i) \beta_k]
$$
\item The right thing to do is probably to plot the response surface (either the probability or the change in probability) over all $X$.
\end{itemize}
\end{itemize}
\end{frame}
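\begin{frame}[fragile]{Marginal Effects in Code}
\footnotesize
A sketch of the two common summaries in R, refitting the probit from the HMDA example (the object name \texttt{bm\_p} and the median-index evaluation point are illustrative choices; packages such as \texttt{margins} can automate this):
\begin{semiverbatim}
bm_p <- glm(deny ~ pi_rat + black, data = hmda, family = binomial(link = "probit"))
Z    <- predict(bm_p, type = "link")           # the index Z_i = X_i'beta-hat
b_pi <- coef(bm_p)["pi_rat"]
mean(dnorm(Z) * b_pi)                          # average marginal effect E[f(Z_i) b_k]
dnorm(median(Z)) * b_pi                        # marginal effect at the median index
\end{semiverbatim}
\end{frame}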
\begin{frame}
\frametitle{Inference}
\begin{itemize}
\item If we have the Hessian Matrix, inference is straightforward.
\item $\mathbf{H}_f(\hat{\beta}^{MLE})$ tells us about the \alert{curvature} of the log-likelihood around the maximum.
\begin{itemize}
\item Function is flat $\rightarrow$ not very precise estimates of parameters
\item Function is steep $\rightarrow$ precise estimates of parameters
\end{itemize}
\item Construct the \alert{Fisher Information} $I(\hat{\beta}^{MLE}) = -E[H_f(\hat{\beta}^{MLE})]$ where the expectation is over the data.
\begin{itemize}
\item Logit does not depend on $y_i$ so $E[H_f(\hat{\beta}^{MLE})]=H_f(\hat{\beta}^{MLE})$.
\item Probit does depend on $y_i$ so $E[H_f(\hat{\beta}^{MLE})]\neq H_f(\hat{\beta}^{MLE})$.
\end{itemize}
\item The inverse Fisher Information $I(\hat{\beta}^{MLE})^{-1} = \left(-E[H_f(\hat{\beta}^{MLE})]\right)^{-1}$ is an estimate of the variance-covariance matrix for $\hat{\beta}$.
\item $\sqrt{diag[I(\hat{\beta}^{MLE})^{-1}]}$ is an estimate for $SE(\hat{\beta})$.
\end{itemize}
\end{frame}
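\begin{frame}[fragile]{Inference in Code}
\footnotesize
A sketch of the variance calculation from a numerical Hessian, reusing the hand-rolled logit \texttt{nll} from a few slides back (since \texttt{nll} is the \emph{negative} log likelihood, its Hessian is already $-H_f$):
\begin{semiverbatim}
fit <- optim(c(0, 0), nll, method = "BFGS", hessian = TRUE)
V   <- solve(fit$hessian)                      # inverse of -H_f at the MLE
sqrt(diag(V))                                  # standard errors for beta-hat
sqrt(diag(vcov(glm(y ~ x[, 2], family = binomial))))   # nearly identical
\end{semiverbatim}
\end{frame}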
\begin{frame}
\frametitle{Goodness of Fit \#1: Pseudo $R^2$}
How well does the model fit the data?
\begin{itemize}
\item No $R^2$ measure (why not?).
\item Well, we have likelihood units, so the average log likelihood tells us something, but it is hard to interpret.
\item $\rho = 1- \frac{LL(\hat{\beta}^{MLE})}{LL(\beta_0)}$ where $LL(\beta_0)$ is the log likelihood of a model with just a constant (unconditional probability of success).
\begin{itemize}
\item If we don't do any better than unconditional mean then $\rho=0$.
\item Won't ever get all of the way to $\rho =1$.
\end{itemize}
\end{itemize}
\end{frame}
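\begin{frame}[fragile]{Pseudo $R^2$ in Code}
\footnotesize
McFadden's pseudo $R^2$ computed from fitted \texttt{glm} objects (here \texttt{bm1} is the logit fit from the earlier slide; the null model has just a constant):
\begin{semiverbatim}
ll_fit  <- logLik(bm1)
ll_null <- logLik(glm(deny ~ 1, data = hmda, family = binomial))
1 - as.numeric(ll_fit)/as.numeric(ll_null)
\end{semiverbatim}
\end{frame}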
\begin{frame}
\frametitle{Goodness of Fit \#2: Confusion Matrix }
\begin{itemize}
\item Machine learning likes to think about this problem more like \alert{classification} than regression.
\item A caution: these are \alert{regression} models not \alert{classification} models.
\item Predict either $\hat{y}_i = 1$ or $\hat{y}_i = 0$ for each observation.