{"cells":[{"cell_type":"markdown","source":["Using the generated data, find matrix $V$ and the vectors that describe the new coordinate system in reduced dimension $3$ using either SVD or eigenvectors & eigenvalues. Graph the data and calculate how much the training set's variance has diminished. Have you preserved $90 \\%$ of the variance?\n","\n","\n","\n"],"metadata":{"id":"RgPTKyZh_6Y6"}},{"cell_type":"code","source":["import numpy as np\n","import matplotlib.pyplot as plt\n","from mpl_toolkits.mplot3d import Axes3D # Import 3D plotting module\n","\n","# Enable interactive mode for Google Colab\n","%matplotlib notebook\n","\n","# Install Plotly\n","!pip install plotly\n","\n","# Import Plotly\n","import plotly.express as px\n","\n","# Set a random seed for reproducibility\n","np.random.seed(42)\n","\n","# Define the number of vectors you want to generate\n","num_vectors = 100\n","\n","# Generate random values for the first dimension\n","dim1 = np.random.randn(num_vectors)\n","\n","# Create the second dimension with dependency on the first dimension and random noise\n","dependency_factor_dim2 = 0.7\n","noise_dim2 = np.random.randn(num_vectors)\n","dim2 = dependency_factor_dim2 * dim1 + 2 + noise_dim2\n","\n","# Create the third dimension with dependency on the second dimension and random noise\n","dependency_factor_dim3 = -1.1\n","noise_dim3 = np.random.randn(num_vectors)\n","dim3 = dependency_factor_dim3 * dim2 - 4 + noise_dim3\n","\n","# Generate random values for the fourth dimension\n","dim4 = np.random.randn(num_vectors)\n","\n","# Create a 4-dimensional vector by stacking the dimensions horizontally\n","B = np.column_stack((dim1, dim2, dim3, dim4))\n","\n","# Create a 3D scatter plot of the first three dimensions using (allows you to rotate the figure with your mouse)\n","fig = px.scatter_3d(x=dim1, y=dim2, z=dim3, color=dim4)\n","fig.update_layout(title='Interactive 3D Scatter Plot of Dimensions 1, 2, 3', scene=dict(xaxis_title='x_1', yaxis_title='x_2', 
zaxis_title='x_3'))\n","\n","# Show the plot\n","fig.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":594},"id":"Wp5UoGVZ0W_X","executionInfo":{"status":"ok","timestamp":1703795602937,"user_tz":300,"elapsed":9515,"user":{"displayName":"Joseph Sketl","userId":"03428201371779072895"}},"outputId":"9baa378b-5d4d-44f0-8240-749d1c871bd0"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (5.15.0)\n","Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly) (8.2.3)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from plotly) (23.2)\n"]},{"output_type":"display_data","data":{"text/html":["\n","\n","\n","
\n","
# SVD solution: PCA of the data matrix B via the singular value decomposition.

# Number of principal components (new coordinate axes) to keep
n_components = 3

# Sample mean of each column (feature) of B
column_means = np.mean(B, axis=0)

# Mean-centred data matrix. Broadcasting subtracts the mean row from every row
# of B. (The data are centred only, not rescaled; the name is kept because the
# eigen-decomposition cell below reads `B_normalized`.)
B_normalized = B - column_means

# Thin SVD of the centred data: B_normalized = U @ diag(S) @ VT.
# full_matrices=False skips the square (n_samples x n_samples) U that the
# default would build; S and VT are not affected by this flag here because B
# has more rows than columns.
U, S, VT = np.linalg.svd(B_normalized, full_matrices=False)

# U:  left singular vectors  (n_samples x n_features for the thin SVD)
# S:  singular values as a vector (already sorted in descending order)
# VT: (V transposed) right singular vectors (n_features x n_features)

# Columns of V belonging to the largest singular values = new coordinate axes
V_top_three = VT.T[:, :n_components]

# Project the centred samples onto those axes
T = B_normalized @ V_top_three

# Graph
fig = px.scatter_3d(x=T[:, 0], y=T[:, 1], z=T[:, 2])
fig.update_layout(
    title='3D Scatter Plot of Dimensions 1, 2, 3 of T',
    scene=dict(xaxis_title='t_1', yaxis_title='t_2', zaxis_title='t_3'),
)

fig.show()

# Matrix V
print('Matrix V:')
print(V_top_three)

# Variance of each column before PCA, and their sum (total variance)
column_variances_of_B_normalized = np.var(B_normalized, axis=0)
total_variance_original = np.sum(column_variances_of_B_normalized)
print("\nSum of the variances of each column before PCA:", total_variance_original)

# Variance of each projected column after PCA
column_variances_of_T = np.var(T, axis=0)

# Sanity check: the variance captured by component k equals S[k]**2 / n_samples
np.testing.assert_allclose(
    column_variances_of_T,
    S[:n_components] ** 2 / B_normalized.shape[0],
)

# Sum of the variances after PCA
total_variance_best_3_new_features = np.sum(column_variances_of_T)
print("\nSum of the variances of each column after PCA:", total_variance_best_3_new_features)

# Ratio
print("\nProportion of variance preserved after PCA:", total_variance_best_3_new_features/total_variance_original)
\n","
# Eigenvectors & eigenvalues solution: PCA via the eigen-decomposition of the
# scatter matrix of the centred data (B_normalized comes from the SVD cell).

# Number of principal components (new coordinate axes) to keep
n_components = 3

# Sigma: scatter matrix of the centred data, i.e. n_samples times the
# (population) covariance matrix. It is symmetric by construction.
Sigma = B_normalized.T @ B_normalized

# Eigen-decomposition. `eigh` is the routine for symmetric matrices: it always
# returns real eigenvalues and orthonormal eigenvectors, which the general
# purpose `eig` does not guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(Sigma)

# Sort eigenpairs by eigenvalue, largest first (eigh returns ascending order)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("Eigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Matrix V: eigenvectors of the largest eigenvalues = new coordinate axes.
# Each eigenvector is only defined up to sign, so columns may be negated
# relative to the SVD result without changing the captured variance.
V = eigenvectors[:, :n_components]
print('\nMatrix V: ')
print(V)

# Z: centred samples projected onto the new axes (one row per sample)
Z = B_normalized @ V

# Graph
fig = px.scatter_3d(x=Z[:, 0], y=Z[:, 1], z=Z[:, 2])
fig.update_layout(
    title='Eigen 3D Scatter Plot of Dimensions 1, 2, 3 of Z',
    scene=dict(xaxis_title='z_1', yaxis_title='z_2', zaxis_title='z_3'),
)
fig.show()

# Variance of each projected column after PCA
column_variances_of_Z = np.var(Z, axis=0)

# Sanity check: the variance captured by component k equals
# eigenvalue_k / n_samples, because Sigma is n_samples times the covariance.
np.testing.assert_allclose(
    column_variances_of_Z,
    eigenvalues[:n_components] / B_normalized.shape[0],
)

# Sum of the variances after PCA
total_variance_best_3_new_features = np.sum(column_variances_of_Z)
print("Sum of the variances of each column after PCA:", total_variance_best_3_new_features)

# Ratio (total_variance_original was computed in the SVD cell)
print("Proportion of variance preserved after PCA:", total_variance_best_3_new_features/total_variance_original)
Sketl","userId":"03428201371779072895"}},"outputId":"e41a5a87-b017-436b-8532-40c831c77b75"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["Eigenvalues:\n","[316.3987448 89.96592894 69.01924526 30.85683482]\n","\n","Eigenvectors:\n","[[ 0.21543116 0.69557149 0.48414346 0.48515444]\n"," [ 0.5217725 0.21156703 0.25244289 -0.7869342 ]\n"," [-0.82179994 0.3710635 0.2108495 -0.3774907 ]\n"," [-0.0773805 -0.57769505 0.81081452 0.05348365]]\n","\n","Matrix V: \n","[[ 0.21543116 0.69557149 0.48414346]\n"," [ 0.5217725 0.21156703 0.25244289]\n"," [-0.82179994 0.3710635 0.2108495 ]\n"," [-0.0773805 -0.57769505 0.81081452]]\n"]},{"output_type":"display_data","data":{"text/html":["\n","\n","\n","
\n","
\n","\n",""]},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Sum of the variances of each column after PCA: 4.753839190050945\n","Proportion of variance preserved after PCA: 0.9390471142724713\n"]}]},{"cell_type":"markdown","source":["Yes. Projecting onto the top three principal components retains about 93.9% of the original variance (4.754 of 5.062), which exceeds the 90% target."],"metadata":{"id":"dl2X24Xjm5WH"}}],"metadata":{"colab":{"provenance":[{"file_id":"1wfxb1QDhWxYs-U9lPLDgHcQXuCCwqWZu","timestamp":1703619687944},{"file_id":"1EuBK79vTWZU6igWlK2lQbEb_TsNG3_5V","timestamp":1701370449107},{"file_id":"1upO3Xz4Y6av-7xfP7VvvHfoGxdyuy0et","timestamp":1700357330905},{"file_id":"1Mt2rj2dwqqLPL96qYBVsh3Rt8rLF5A34","timestamp":1687918324644},{"file_id":"1gT2fhEeqw0PuEVlGulldHYbtNpAvdXEA","timestamp":1673624448116},{"file_id":"1c_RHyXQa6_Ey9Sk2yPLeAP-Al1UqTs_5","timestamp":1668914028956},{"file_id":"1ejvKvJeMpU9qzK1CBPKCqBjfD9Rw7rMN","timestamp":1668908178786}]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}