LinearRegression.java
/******************************************************************************
 *  Compilation:  javac LinearRegression.java
 *  Execution:    java LinearRegression
 *  Dependencies: none
 *
 *  Compute least squares solution to y = beta * x + alpha.
 *  Simple linear regression.
 *
 ******************************************************************************/

package edu.princeton.cs.algs4;

/**
 * The {@code LinearRegression} class performs a simple linear regression
 * on a set of <em>n</em> data points (<em>y<sub>i</sub></em>, <em>x<sub>i</sub></em>).
 * That is, it fits a straight line <em>y</em> = α + β <em>x</em>
 * (where <em>y</em> is the response variable, <em>x</em> is the predictor variable,
 * α is the <em>y-intercept</em>, and β is the <em>slope</em>)
 * that minimizes the sum of squared residuals of the linear regression model.
 * It also computes associated statistics, including the coefficient of
 * determination <em>R</em><sup>2</sup> and the standard errors of the
 * estimates for the slope and <em>y</em>-intercept.
 *
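 * <p>
 * A typical usage sketch (the data values below are illustrative only,
 * not library test data):
 * <pre>{@code
 * double[] x = { 1.0, 2.0, 3.0, 4.0 };
 * double[] y = { 3.1, 5.0, 6.9, 9.1 };
 * LinearRegression lr = new LinearRegression(x, y);
 * double yhat = lr.predict(5.0);   // predicted response at x = 5.0
 * }</pre>
 *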
 * @author Robert Sedgewick
 * @author Kevin Wayne
 */
public class LinearRegression {
    private final double intercept, slope;
    private final double r2;
    private final double svar0, svar1;

    /**
     * Performs a linear regression on the data points {@code (y[i], x[i])}.
     *
     * @param x the values of the predictor variable
     * @param y the corresponding values of the response variable
     * @throws IllegalArgumentException if the lengths of the two arrays are not equal
     */
    public LinearRegression(double[] x, double[] y) {
        if (x.length != y.length) {
            throw new IllegalArgumentException("array lengths are not equal");
        }
        int n = x.length;

        // first pass: accumulate the sums needed for the sample means
        double sumx = 0.0, sumy = 0.0;
        for (int i = 0; i < n; i++) {
            sumx += x[i];
            sumy += y[i];
        }
        double xbar = sumx / n;
        double ybar = sumy / n;

        // second pass: compute summary statistics
        // xxbar = sum of (x[i] - xbar)^2, yybar = sum of (y[i] - ybar)^2,
        // xybar = sum of (x[i] - xbar) * (y[i] - ybar)
        double xxbar = 0.0, yybar = 0.0, xybar = 0.0;
        for (int i = 0; i < n; i++) {
            xxbar += (x[i] - xbar) * (x[i] - xbar);
            yybar += (y[i] - ybar) * (y[i] - ybar);
            xybar += (x[i] - xbar) * (y[i] - ybar);
        }

        // least-squares estimates: the fitted line passes through (xbar, ybar)
        slope = xybar / xxbar;
        intercept = ybar - slope * xbar;

        // more statistical analysis
        double rss = 0.0;      // residual sum of squares
        double ssr = 0.0;      // regression sum of squares
        for (int i = 0; i < n; i++) {
            double fit = slope*x[i] + intercept;
            rss += (fit - y[i]) * (fit - y[i]);
            ssr += (fit - ybar) * (fit - ybar);
        }

        int degreesOfFreedom = n - 2;
        r2 = ssr / yybar;                        // coefficient of determination
        double svar = rss / degreesOfFreedom;    // unbiased estimate of the residual variance
        svar1 = svar / xxbar;                    // variance of the slope estimate
        svar0 = svar/n + xbar*xbar*svar1;        // variance of the intercept estimate
    }

    /**
     * Returns the <em>y</em>-intercept α of the best-fit line <em>y</em> = α + β <em>x</em>.
     *
     * @return the <em>y</em>-intercept α of the best-fit line <em>y</em> = α + β <em>x</em>
     */
    public double intercept() {
        return intercept;
    }

    /**
     * Returns the slope β of the best-fit line <em>y</em> = α + β <em>x</em>.
     *
     * @return the slope β of the best-fit line <em>y</em> = α + β <em>x</em>
     */
    public double slope() {
        return slope;
    }

    /**
     * Returns the coefficient of determination <em>R</em><sup>2</sup>.
     *
     * @return the coefficient of determination <em>R</em><sup>2</sup>,
     *         which is a real number between 0 and 1
     */
    public double R2() {
        return r2;
    }

    /**
     * Returns the standard error of the estimate for the intercept.
     *
     * @return the standard error of the estimate for the intercept
     */
    public double interceptStdErr() {
        return Math.sqrt(svar0);
    }

    /**
     * Returns the standard error of the estimate for the slope.
     *
     * @return the standard error of the estimate for the slope
     */
    public double slopeStdErr() {
        return Math.sqrt(svar1);
    }

    /**
     * Returns the expected response {@code y} given the value of the predictor
     * variable {@code x}.
     *
     * @param x the value of the predictor variable
     * @return the expected response {@code y} given the value of the predictor
     *         variable {@code x}
     */
    public double predict(double x) {
        return slope*x + intercept;
    }

    /**
     * Returns a string representation of the simple linear regression model.
     *
     * @return a string representation of the simple linear regression model,
     *         including the best-fit line and the coefficient of determination
     *         <em>R</em><sup>2</sup>
     */
    @Override
    public String toString() {
        StringBuilder s = new StringBuilder();
        s.append(String.format("%.2f x + %.2f", slope(), intercept()));
        s.append(" (R^2 = " + String.format("%.3f", R2()) + ")");
        return s.toString();
    }
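
    /**
     * Unit-tests the {@code LinearRegression} data type on a small synthetic
     * data set. (A minimal client sketched here because the file header
     * advertises {@code java LinearRegression}; the data values are
     * illustrative assumptions, not library test data.)
     */
    public static void main(String[] args) {
        // points scattered around the line y = 2x + 3
        double[] x = { 1.0, 2.0, 3.0, 4.0, 5.0 };
        double[] y = { 5.1, 6.9, 9.2, 10.9, 13.1 };
        LinearRegression regression = new LinearRegression(x, y);
        System.out.println(regression);                // fitted line and R^2
        System.out.println(regression.slopeStdErr());  // standard error of the slope
        System.out.println(regression.predict(6.0));   // predicted response at x = 6.0
    }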
}

/******************************************************************************
 *  Copyright 2002-2016, Robert Sedgewick and Kevin Wayne.
 *
 *  This file is part of algs4.jar, which accompanies the textbook
 *
 *      Algorithms, 4th edition by Robert Sedgewick and Kevin Wayne,
 *      Addison-Wesley Professional, 2011, ISBN 0-321-57351-X.
 *      http://algs4.cs.princeton.edu
 *
 *
 *  algs4.jar is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  algs4.jar is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with algs4.jar. If not, see http://www.gnu.org/licenses.
 ******************************************************************************/