// Figure 1: sampling distribution of the OLS slope across repeated samples
set seed 1
// Results file: one slope estimate (b) per replication
tempname sim
postfile `sim' b using clt, replace
// 10,000 replications, each a fresh sample of 100 observations
forvalues rep = 1/10000 {
    drop _all
    quietly set obs 100
    // y depends on x with a slope of .4 plus standard-normal noise
    generate x = rnormal()
    generate e = rnormal()
    generate y = .4 * x + e
    quietly regress y x
    local slope = _b[x]
    post `sim' (`slope')
}
postclose `sim'
// Load the stored slopes and plot their frequency distribution
use clt, clear
histogram b, freq ///
    ytitle("Frequency of b value") ///
    xtitle("Values of b") ///
    name(figure1, replace)
// Figure 2: distribution of R-squared when y is pure noise, unrelated to any
// of the 15 predictors, for subsamples of 50, 100, 150 and 200 cases.
// This section previously appeared twice, verbatim; the second copy re-ran the
// same 10,000 replications with the same seed and overwrote the same results
// file and graph, so it is kept only once here.
set seed 1
// Create file to store simulation results: one R-squared per sample size
tempname foo
postfile `foo' r2_50 r2_100 r2_150 r2_200 using overfitting, replace
// Simulate analyses
forvalues i = 1/10000 {
    drop _all
    // quietly: otherwise the observation-count message is echoed every replication
    quietly set obs 10000
    generate y = rnormal()
    // p is a separate index so the replication counter `i' is not reused
    forvalues p = 1/15 {
        generate x`p' = rnormal()
    }
    // Draw a random subsample of each size from the 10,000 cases, fit the
    // 15-predictor model, keep its R-squared, then restore the full data
    foreach j of numlist 50 100 150 200 {
        preserve
        quietly sample `j', count
        quietly regress y x*
        local r2_`j' = e(r2)
        restore
    }
    post `foo' (`r2_50') (`r2_100') (`r2_150') (`r2_200')
}
postclose `foo'
// Open results from simulations
use overfitting, clear
// Plot
// NOTE(review): kdensity draws a kernel density estimate, so the y-axis is a
// density; the "Percent of samples" title may be a mislabel -- confirm.
twoway (kdensity r2_200) ///
    (kdensity r2_150) ///
    (kdensity r2_100) ///
    (kdensity r2_50) ///
    , ytitle("Percent of samples") ///
    xtitle("R-square value from regression model") ///
    xlabel(0 (.1) .6) ///
    ylabel(0 (2) 20) ///
    legend(order(1 "ca. 13 cases/predictor ({it:N} = 200)" ///
    2 "10 cases/predictor ({it:N} = 150)" ///
    3 "ca. 7 cases/predictor ({it:N} = 100)" ///
    4 "ca. 3 cases/predictor ({it:N} = 50)") ///
    pos(2) ring(0)) name(figure2, replace)
// Figure 4 (actually Table 1): Type I error rate for a predictor with no true
// effect (x2) after both continuous predictors are dichotomized at their
// sample medians, by sample size and by the correlation between x1 and x2.
clear
set seed 1
// Create file to store simulation results: sample size, correlation, and a
// 0/1 indicator (typei) for whether the null effect of x2 was declared
// significant at the 5% level
tempname foo
postfile `foo' n correlation typei using dichotomization, replace
// Simulate analyses
forvalues i = 1/10000 {
    foreach j of numlist 50 100 200 {
        foreach k of numlist 0 .3 .5 .7 {
            // drawnorm's clear option replaces the data in memory, so no
            // separate drop is needed before each draw
            qui drawnorm x1 x2, ///
                n(`j') ///
                corr(1, `k', 1) cstorage(lower) ///
                clear
            generate e = rnormal()
            // Only x1 has a true effect; the coefficient on x2 is zero
            generate y = .5*x1 + 0*x2 + e
            // Median split of each predictor
            qui sum x1, detail
            generate x1s = (x1 > r(p50))
            qui sum x2, detail
            generate x2s = (x2 > r(p50))
            qui reg y x1s x2s
            local tstat = _b[x2s]/_se[x2s]
            // Two-sided 5% test against the t distribution with the model's
            // residual degrees of freedom; a fixed 1.96 cutoff over-rejects in
            // the smaller samples. A missing t statistic is posted as missing,
            // because Stata treats missing as larger than any number and the
            // comparison would otherwise count it as a rejection.
            local sig = cond(missing(`tstat'), ., abs(`tstat') > invttail(e(df_r), .025))
            post `foo' (`j') (`k') (`sig')
        }
    }
}
postclose `foo'
// Open results from simulations
use dichotomization, clear
// Mean of the 0/1 indicator within each cell = observed Type I error rate
collapse typei, by(n correlation)
graph hbar typei, over(n, relabel(1 "{it:N} = 50" 2 "{it:N} = 100" 3 "{it:N} = 200")) ///
    over(correlation, relabel(1 "{it:Corr(x{sub:1}, x{sub:2})} = 0" ///
    2 "{it:Corr(x{sub:1}, x{sub:2})} = .3" ///
    3 "{it:Corr(x{sub:1}, x{sub:2})} = .5" ///
    4 "{it:Corr(x{sub:1}, x{sub:2})} = .7")) ///
    ytitle("Type I error rate") yscale(alt) ylabel(, format(%6.2f)) ///
    name(figure4, replace)
// Jul 29, 2017
// OLS regression: overfitting and dichotomization
// Babyak (2004) demonstrates a couple of aspects of OLS regression, using the simulations above.
// Labels:
// collapse,
// drawnorm,
// graph hbar,
// Histogram,
// Simulation,
// Textbooks,
// twoway kdensity


