import numpy as np

# Train/Test split
def train_test_split(X, y, split_size=0.2):
    # Generate shuffled indices
    num_data_points = X.shape[0]
    shuffled_indices = np.random.permutation(num_data_points)
    # Split indices into test and train
    test_set_size = int(num_data_points * split_size)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    # Use the indices to split the data (X and y are expected to be pandas objects)
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]
    return X_train, X_test, y_train, y_test
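# A minimal usage sketch (an illustration only, assuming X is a pandas DataFrame
# of features and y the matching pandas Series/DataFrame of targets):
#
#   X_train, X_test, y_train, y_test = train_test_split(X, y, split_size=0.2)
#   print(len(X_train), len(X_test))   # roughly an 80% / 20% row split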
# Calculate mean squared error (using for loops, todo: convert to matrices;
# a vectorized sketch follows below)
def mse(X, y, w, b):
    num_rows, num_features = X.shape
    total_squared_error = 0
    # Run the predictions for X (y_i = sum_j w_j * x_ij + b)
    for i in range(num_rows):
        y_i = 0
        for j in range(num_features):
            y_i += X.iloc[i, j] * w[j]  # Prediction for a single row
        y_i += b  # Add bias
        # Accumulate the squared loss
        total_squared_error += (y_i - y.iloc[i].item()) ** 2
    # Return mean error
    return total_squared_error / num_rows
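# One possible way to resolve the "convert to matrices" todo above: a vectorized
# sketch that computes all predictions with a single matrix-vector product.
# (Not part of the original code; assumes w is an array-like of per-feature weights.)
def mse_vectorized(X, y, w, b):
    # (num_rows, num_features) @ (num_features,) -> (num_rows,) predictions
    predictions = np.asarray(X) @ np.asarray(w) + b
    errors = predictions - np.asarray(y).ravel()
    return np.mean(errors ** 2)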
# Z-score standardization
def standardize(X_train, X_test):
    # Calculate the mean and standard deviation of the training set
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # To prevent division by zero, set std=1 where std is zero
    std[std == 0] = 1
    # Standardize the training set
    X_train_scaled = (X_train - mean) / std
    # Use the training set's mean and std to standardize the test set
    X_test_scaled = (X_test - mean) / std
    return X_train_scaled, X_test_scaled
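# Usage sketch: the scaler statistics come from the training split only, so the
# test set never leaks into them. (Illustration, assuming pandas inputs.)
#
#   X_train_scaled, X_test_scaled = standardize(X_train, X_test)
#   print(X_train_scaled.mean(axis=0))  # ~0 for every feature
#   print(X_train_scaled.std(axis=0))   # ~1 for every feature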
# Stochastic Gradient Descent (SGD) implementation
def sgd(X, y, learning_rate=0.01, n_iterations=1000):
    X = np.asarray(X)                 # Accept either a DataFrame or an ndarray
    y = np.asarray(y).reshape(-1, 1)  # Reshape y to be a column vector
    m, n = X.shape
    theta = np.random.randn(n, 1)     # Initialize theta with random values
    mse_history = []                  # List to store MSE values over iterations
    for iteration in range(n_iterations):
        for i in range(m):
            # Pick one random row and take a gradient step on it
            random_index = np.random.randint(m)
            Xi = X[random_index:random_index + 1]
            yi = y[random_index:random_index + 1]
            gradients = 2 * Xi.T.dot(Xi.dot(theta) - yi)
            theta = theta - learning_rate * gradients
            # Note: no separate bias term is learned here;
            # b = b - learning_rate * gradient_b where gradient_b = -2/n * np.sum(error)
        # Store MSE after every full pass over the data
        mse = np.mean((X.dot(theta) - y) ** 2)
        mse_history.append(mse)
        if iteration % 100 == 0 and iteration != 0:
            print(f"Iteration {iteration}: MSE = {mse}")
    return theta, mse_history
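# End-to-end sketch combining the pieces above (illustration only). Because sgd()
# learns no separate bias term, one common workaround, assumed here rather than
# taken from the code above, is to append a column of ones so the bias is
# absorbed into theta:
#
#   X_train, X_test, y_train, y_test = train_test_split(X, y, split_size=0.2)
#   X_train_s, X_test_s = standardize(X_train, X_test)
#   X_train_b = np.c_[np.ones(len(X_train_s)), np.asarray(X_train_s)]
#   X_test_b = np.c_[np.ones(len(X_test_s)), np.asarray(X_test_s)]
#   theta, mse_history = sgd(X_train_b, y_train, learning_rate=0.01, n_iterations=1000)
#   test_mse = np.mean((X_test_b.dot(theta).ravel() - np.asarray(y_test).ravel()) ** 2)
#   print(test_mse)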