]> git.example.dev Git - research-obsidian.git/commitdiff
ml notes even more
author2weiEmu <saalbach.robert@outlook.de>
Wed, 5 Nov 2025 17:17:59 +0000 (18:17 +0100)
committer2weiEmu <saalbach.robert@outlook.de>
Wed, 5 Nov 2025 17:17:59 +0000 (18:17 +0100)
58 files changed:
.obsidian/appearance.json
.obsidian/workspace.json
Pasted image 20251105142838.png [new file with mode: 0644]
Pasted image 20251105143024.png [new file with mode: 0644]
Pasted image 20251105143447.png [new file with mode: 0644]
Pasted image 20251105143524.png [new file with mode: 0644]
Pasted image 20251105143906.png [new file with mode: 0644]
Pasted image 20251105144121.png [new file with mode: 0644]
Pasted image 20251105144203.png [new file with mode: 0644]
Pasted image 20251105144311.png [new file with mode: 0644]
Pasted image 20251105144526.png [new file with mode: 0644]
Pasted image 20251105144615.png [new file with mode: 0644]
Pasted image 20251105144731.png [new file with mode: 0644]
Pasted image 20251105151944.png [new file with mode: 0644]
Pasted image 20251105152000.png [new file with mode: 0644]
Pasted image 20251105152032.png [new file with mode: 0644]
Pasted image 20251105152429.png [new file with mode: 0644]
Pasted image 20251105152455.png [new file with mode: 0644]
Pasted image 20251105152915.png [new file with mode: 0644]
Pasted image 20251105153029.png [new file with mode: 0644]
Pasted image 20251105153138.png [new file with mode: 0644]
Pasted image 20251105153529.png [new file with mode: 0644]
Pasted image 20251105153632.png [new file with mode: 0644]
Pasted image 20251105153851.png [new file with mode: 0644]
Pasted image 20251105154154.png [new file with mode: 0644]
Pasted image 20251105164313.png [new file with mode: 0644]
Pasted image 20251105164449.png [new file with mode: 0644]
Pasted image 20251105164828.png [new file with mode: 0644]
Pasted image 20251105164842.png [new file with mode: 0644]
Pasted image 20251105164856.png [new file with mode: 0644]
Pasted image 20251105165950.png [new file with mode: 0644]
Pasted image 20251105170031.png [new file with mode: 0644]
Pasted image 20251105170216.png [new file with mode: 0644]
Pasted image 20251105170249.png [new file with mode: 0644]
Pasted image 20251105174010.png [new file with mode: 0644]
Pasted image 20251105174017.png [new file with mode: 0644]
Pasted image 20251105174301.png [new file with mode: 0644]
Pasted image 20251105174734.png [new file with mode: 0644]
Pasted image 20251105174752.png [new file with mode: 0644]
Pasted image 20251105175358.png [new file with mode: 0644]
Pasted image 20251105175413.png [new file with mode: 0644]
Pasted image 20251105175449.png [new file with mode: 0644]
Pasted image 20251105175506.png [new file with mode: 0644]
Pasted image 20251105175851.png [new file with mode: 0644]
Pasted image 20251105175900.png [new file with mode: 0644]
Pasted image 20251105180633.png [new file with mode: 0644]
Pasted image 20251105180954.png [new file with mode: 0644]
Pasted image 20251105181214.png [new file with mode: 0644]
Pasted image 20251105181316.png [new file with mode: 0644]
Pasted image 20251105181342.png [new file with mode: 0644]
Pasted image 20251105181354.png [new file with mode: 0644]
Pasted image 20251105181429.png [new file with mode: 0644]
Pasted image 20251105181440.png [new file with mode: 0644]
Pasted image 20251105181448.png [new file with mode: 0644]
Pasted image 20251105181633.png [new file with mode: 0644]
Pasted image 20251105181649.png [new file with mode: 0644]
Pasted image 20251105181742.png [new file with mode: 0644]
University/Machine Learning/Full Notes.md

index 60c2234843f2137228c82a7eef64beadca323a73..9a6027e19629db23a0af51e38552821949176ba6 100644 (file)
@@ -1,8 +1,8 @@
 {
   "theme": "obsidian",
   "accentColor": "#f2281c",
-  "interfaceFontFamily": "Aptos Mono",
-  "textFontFamily": "Aptos Mono",
-  "monospaceFontFamily": "Aptos Mono",
+  "interfaceFontFamily": "Courier New",
+  "textFontFamily": "Courier New",
+  "monospaceFontFamily": "Courier New",
   "cssTheme": "Dawn"
 }
\ No newline at end of file
index 47b87d25159af528c5b36c0af0d92d5d897359bb..1ef08bd2ecf6adf00b1a546dd14b9f890866d1b2 100644 (file)
@@ -4,11 +4,11 @@
     "type": "split",
     "children": [
       {
-        "id": "0d762e903c6b0576",
+        "id": "226611ae14e85609",
         "type": "tabs",
         "children": [
           {
-            "id": "214929be76b06d19",
+            "id": "3e78d839de43b5f0",
             "type": "leaf",
             "state": {
               "type": "markdown",
       "pdf-plus:PDF++: Toggle auto-paste": false
     }
   },
-  "active": "214929be76b06d19",
+  "active": "3e78d839de43b5f0",
   "lastOpenFiles": [
-    "Pasted image 20251104172129.png",
-    "Pasted image 20251104172116.png",
-    "Pasted image 20251104172012.png",
-    "Pasted image 20251104172000.png",
-    "Pasted image 20251104171952.png",
-    "Pasted image 20251104171659.png",
-    "Pasted image 20251104171550.png",
-    "Pasted image 20251104171524.png",
-    "Pasted image 20251104171324.png",
-    "Pasted image 20251104171316.png",
+    "Pasted image 20251105181742.png",
+    "Pasted image 20251105181649.png",
+    "Pasted image 20251105181633.png",
+    "Pasted image 20251105181448.png",
+    "Pasted image 20251105181440.png",
+    "Pasted image 20251105181429.png",
+    "Pasted image 20251105181354.png",
+    "Pasted image 20251105181342.png",
+    "Pasted image 20251105181316.png",
+    "Pasted image 20251105181214.png",
+    "Pasted image 20251105180954.png",
     "University/Machine Learning/Full Notes.md",
-    "Pasted image 20251104170706.png",
     "Untitled 1.md",
     "some_ideas.md",
     "University/Machine Learning",
diff --git a/Pasted image 20251105142838.png b/Pasted image 20251105142838.png
new file mode 100644 (file)
index 0000000..8ef0187
Binary files /dev/null and b/Pasted image 20251105142838.png differ
diff --git a/Pasted image 20251105143024.png b/Pasted image 20251105143024.png
new file mode 100644 (file)
index 0000000..5019d76
Binary files /dev/null and b/Pasted image 20251105143024.png differ
diff --git a/Pasted image 20251105143447.png b/Pasted image 20251105143447.png
new file mode 100644 (file)
index 0000000..1732cb3
Binary files /dev/null and b/Pasted image 20251105143447.png differ
diff --git a/Pasted image 20251105143524.png b/Pasted image 20251105143524.png
new file mode 100644 (file)
index 0000000..f616332
Binary files /dev/null and b/Pasted image 20251105143524.png differ
diff --git a/Pasted image 20251105143906.png b/Pasted image 20251105143906.png
new file mode 100644 (file)
index 0000000..b61505f
Binary files /dev/null and b/Pasted image 20251105143906.png differ
diff --git a/Pasted image 20251105144121.png b/Pasted image 20251105144121.png
new file mode 100644 (file)
index 0000000..ad3a755
Binary files /dev/null and b/Pasted image 20251105144121.png differ
diff --git a/Pasted image 20251105144203.png b/Pasted image 20251105144203.png
new file mode 100644 (file)
index 0000000..b8c6e46
Binary files /dev/null and b/Pasted image 20251105144203.png differ
diff --git a/Pasted image 20251105144311.png b/Pasted image 20251105144311.png
new file mode 100644 (file)
index 0000000..bf48f7e
Binary files /dev/null and b/Pasted image 20251105144311.png differ
diff --git a/Pasted image 20251105144526.png b/Pasted image 20251105144526.png
new file mode 100644 (file)
index 0000000..0fa28ba
Binary files /dev/null and b/Pasted image 20251105144526.png differ
diff --git a/Pasted image 20251105144615.png b/Pasted image 20251105144615.png
new file mode 100644 (file)
index 0000000..cd68fb5
Binary files /dev/null and b/Pasted image 20251105144615.png differ
diff --git a/Pasted image 20251105144731.png b/Pasted image 20251105144731.png
new file mode 100644 (file)
index 0000000..f5caed5
Binary files /dev/null and b/Pasted image 20251105144731.png differ
diff --git a/Pasted image 20251105151944.png b/Pasted image 20251105151944.png
new file mode 100644 (file)
index 0000000..909aea6
Binary files /dev/null and b/Pasted image 20251105151944.png differ
diff --git a/Pasted image 20251105152000.png b/Pasted image 20251105152000.png
new file mode 100644 (file)
index 0000000..99b4be3
Binary files /dev/null and b/Pasted image 20251105152000.png differ
diff --git a/Pasted image 20251105152032.png b/Pasted image 20251105152032.png
new file mode 100644 (file)
index 0000000..3188a24
Binary files /dev/null and b/Pasted image 20251105152032.png differ
diff --git a/Pasted image 20251105152429.png b/Pasted image 20251105152429.png
new file mode 100644 (file)
index 0000000..a961b98
Binary files /dev/null and b/Pasted image 20251105152429.png differ
diff --git a/Pasted image 20251105152455.png b/Pasted image 20251105152455.png
new file mode 100644 (file)
index 0000000..d9b8613
Binary files /dev/null and b/Pasted image 20251105152455.png differ
diff --git a/Pasted image 20251105152915.png b/Pasted image 20251105152915.png
new file mode 100644 (file)
index 0000000..2c85c1f
Binary files /dev/null and b/Pasted image 20251105152915.png differ
diff --git a/Pasted image 20251105153029.png b/Pasted image 20251105153029.png
new file mode 100644 (file)
index 0000000..6bf9dd5
Binary files /dev/null and b/Pasted image 20251105153029.png differ
diff --git a/Pasted image 20251105153138.png b/Pasted image 20251105153138.png
new file mode 100644 (file)
index 0000000..c3dcbd2
Binary files /dev/null and b/Pasted image 20251105153138.png differ
diff --git a/Pasted image 20251105153529.png b/Pasted image 20251105153529.png
new file mode 100644 (file)
index 0000000..4116359
Binary files /dev/null and b/Pasted image 20251105153529.png differ
diff --git a/Pasted image 20251105153632.png b/Pasted image 20251105153632.png
new file mode 100644 (file)
index 0000000..ffd02fa
Binary files /dev/null and b/Pasted image 20251105153632.png differ
diff --git a/Pasted image 20251105153851.png b/Pasted image 20251105153851.png
new file mode 100644 (file)
index 0000000..9a18473
Binary files /dev/null and b/Pasted image 20251105153851.png differ
diff --git a/Pasted image 20251105154154.png b/Pasted image 20251105154154.png
new file mode 100644 (file)
index 0000000..6e21f23
Binary files /dev/null and b/Pasted image 20251105154154.png differ
diff --git a/Pasted image 20251105164313.png b/Pasted image 20251105164313.png
new file mode 100644 (file)
index 0000000..59c420f
Binary files /dev/null and b/Pasted image 20251105164313.png differ
diff --git a/Pasted image 20251105164449.png b/Pasted image 20251105164449.png
new file mode 100644 (file)
index 0000000..c66c93f
Binary files /dev/null and b/Pasted image 20251105164449.png differ
diff --git a/Pasted image 20251105164828.png b/Pasted image 20251105164828.png
new file mode 100644 (file)
index 0000000..583a5e8
Binary files /dev/null and b/Pasted image 20251105164828.png differ
diff --git a/Pasted image 20251105164842.png b/Pasted image 20251105164842.png
new file mode 100644 (file)
index 0000000..9c379fe
Binary files /dev/null and b/Pasted image 20251105164842.png differ
diff --git a/Pasted image 20251105164856.png b/Pasted image 20251105164856.png
new file mode 100644 (file)
index 0000000..7a9fabc
Binary files /dev/null and b/Pasted image 20251105164856.png differ
diff --git a/Pasted image 20251105165950.png b/Pasted image 20251105165950.png
new file mode 100644 (file)
index 0000000..564d134
Binary files /dev/null and b/Pasted image 20251105165950.png differ
diff --git a/Pasted image 20251105170031.png b/Pasted image 20251105170031.png
new file mode 100644 (file)
index 0000000..6fdb2e5
Binary files /dev/null and b/Pasted image 20251105170031.png differ
diff --git a/Pasted image 20251105170216.png b/Pasted image 20251105170216.png
new file mode 100644 (file)
index 0000000..783f669
Binary files /dev/null and b/Pasted image 20251105170216.png differ
diff --git a/Pasted image 20251105170249.png b/Pasted image 20251105170249.png
new file mode 100644 (file)
index 0000000..118ced9
Binary files /dev/null and b/Pasted image 20251105170249.png differ
diff --git a/Pasted image 20251105174010.png b/Pasted image 20251105174010.png
new file mode 100644 (file)
index 0000000..bfb5996
Binary files /dev/null and b/Pasted image 20251105174010.png differ
diff --git a/Pasted image 20251105174017.png b/Pasted image 20251105174017.png
new file mode 100644 (file)
index 0000000..b3704a9
Binary files /dev/null and b/Pasted image 20251105174017.png differ
diff --git a/Pasted image 20251105174301.png b/Pasted image 20251105174301.png
new file mode 100644 (file)
index 0000000..dcd2be9
Binary files /dev/null and b/Pasted image 20251105174301.png differ
diff --git a/Pasted image 20251105174734.png b/Pasted image 20251105174734.png
new file mode 100644 (file)
index 0000000..d105ccb
Binary files /dev/null and b/Pasted image 20251105174734.png differ
diff --git a/Pasted image 20251105174752.png b/Pasted image 20251105174752.png
new file mode 100644 (file)
index 0000000..3eeb1b6
Binary files /dev/null and b/Pasted image 20251105174752.png differ
diff --git a/Pasted image 20251105175358.png b/Pasted image 20251105175358.png
new file mode 100644 (file)
index 0000000..a77c984
Binary files /dev/null and b/Pasted image 20251105175358.png differ
diff --git a/Pasted image 20251105175413.png b/Pasted image 20251105175413.png
new file mode 100644 (file)
index 0000000..0c25ddd
Binary files /dev/null and b/Pasted image 20251105175413.png differ
diff --git a/Pasted image 20251105175449.png b/Pasted image 20251105175449.png
new file mode 100644 (file)
index 0000000..39d33af
Binary files /dev/null and b/Pasted image 20251105175449.png differ
diff --git a/Pasted image 20251105175506.png b/Pasted image 20251105175506.png
new file mode 100644 (file)
index 0000000..e2f6537
Binary files /dev/null and b/Pasted image 20251105175506.png differ
diff --git a/Pasted image 20251105175851.png b/Pasted image 20251105175851.png
new file mode 100644 (file)
index 0000000..949da53
Binary files /dev/null and b/Pasted image 20251105175851.png differ
diff --git a/Pasted image 20251105175900.png b/Pasted image 20251105175900.png
new file mode 100644 (file)
index 0000000..8427200
Binary files /dev/null and b/Pasted image 20251105175900.png differ
diff --git a/Pasted image 20251105180633.png b/Pasted image 20251105180633.png
new file mode 100644 (file)
index 0000000..19badb3
Binary files /dev/null and b/Pasted image 20251105180633.png differ
diff --git a/Pasted image 20251105180954.png b/Pasted image 20251105180954.png
new file mode 100644 (file)
index 0000000..bf36681
Binary files /dev/null and b/Pasted image 20251105180954.png differ
diff --git a/Pasted image 20251105181214.png b/Pasted image 20251105181214.png
new file mode 100644 (file)
index 0000000..63efb88
Binary files /dev/null and b/Pasted image 20251105181214.png differ
diff --git a/Pasted image 20251105181316.png b/Pasted image 20251105181316.png
new file mode 100644 (file)
index 0000000..9990e6d
Binary files /dev/null and b/Pasted image 20251105181316.png differ
diff --git a/Pasted image 20251105181342.png b/Pasted image 20251105181342.png
new file mode 100644 (file)
index 0000000..20fdbcd
Binary files /dev/null and b/Pasted image 20251105181342.png differ
diff --git a/Pasted image 20251105181354.png b/Pasted image 20251105181354.png
new file mode 100644 (file)
index 0000000..8aad19f
Binary files /dev/null and b/Pasted image 20251105181354.png differ
diff --git a/Pasted image 20251105181429.png b/Pasted image 20251105181429.png
new file mode 100644 (file)
index 0000000..03225db
Binary files /dev/null and b/Pasted image 20251105181429.png differ
diff --git a/Pasted image 20251105181440.png b/Pasted image 20251105181440.png
new file mode 100644 (file)
index 0000000..e73b97b
Binary files /dev/null and b/Pasted image 20251105181440.png differ
diff --git a/Pasted image 20251105181448.png b/Pasted image 20251105181448.png
new file mode 100644 (file)
index 0000000..2c8cf69
Binary files /dev/null and b/Pasted image 20251105181448.png differ
diff --git a/Pasted image 20251105181633.png b/Pasted image 20251105181633.png
new file mode 100644 (file)
index 0000000..5749c03
Binary files /dev/null and b/Pasted image 20251105181633.png differ
diff --git a/Pasted image 20251105181649.png b/Pasted image 20251105181649.png
new file mode 100644 (file)
index 0000000..d563ba5
Binary files /dev/null and b/Pasted image 20251105181649.png differ
diff --git a/Pasted image 20251105181742.png b/Pasted image 20251105181742.png
new file mode 100644 (file)
index 0000000..a07ba18
Binary files /dev/null and b/Pasted image 20251105181742.png differ
index 01de4a655ee046269e2dc6761505eeec37f341ab..ea95c091c7c3bce5c285517d3d9cd85974cccddb 100644 (file)
@@ -841,3 +841,552 @@ conclusions:
 
 - there is a fundamental tradeoff between the bias and the variance of a classifer (depending on how flexible / complex the classifier is)
 - finding the correct regulariser is a 'black art' of ML
+
+# Linear Classifiers
+![[Pasted image 20251105142838.png]]
+
+how to define a classifier / regressor through a loss function and hypothesis class
+"what is linear about a linear model"
+finding the classifier / regressor by minimizing the "loss" on the training set
+Constructing a linear regression model and logistic classifier by choosing a loss function and hypothesis class
+## risk minimisation
+![[Pasted image 20251105143024.png]]
+two classes with joint probability. How to find a good discriminator?
+Here: rather than modelling the complete distribution P(X,Y), directly model the decision / discriminant function (P(Y|X) or h(X)) or decision boundary
+g(X) is h(X) (in previous references, should be at least)
+
+2 choices:
+- What can classifiers look like?
+- How do we measure how well the classifier works?
+
+$$L(h(X),Y)$$
+X,Y: random variables corresponding to the input features and outcome of interest
+h: hypothesis function that maps input to a decision value
+L: loss / cost function that measures how well the predicted outcome matches the real outcome
+ - sounds easy, enough, given the thing that you are predicting using your h - how well does it do?
+
+## the empirical risk
+$$\min_{h\in H}\frac{1}{N}\sum_{i=1}^NL(h(x_i),y_i)$$
+x,y: observed values corresponding to the input features and outcome of interest
+h: hypothesis function from earlier
+L: loss / cost function
+![[Pasted image 20251105143447.png]]
+
+
+
+![[Pasted image 20251105143524.png]]
+find the function with the smallest average loss. Doesn't seem that hard (in principle)
+
+## hypotheses
+- when the outcome is continuous (regression), h(x) can directly correspond to the function we want to find
+       - -> you are trying to emulate an arbitrary function
+- in classification h(x) is the discriminant function which gives a real-valued output
+- to go from this output to a class decision we still need to set a cut-off, for instance at 0:
+$y=c_1$ if $h(x)>0$
+$y=c_0$ if $h(x)\leq 0$
+(this is also the h(x)=0 we saw on the graph earlier, two images up)
+
+## Hypothesis Class: Linear
+![[Pasted image 20251105143906.png]]
+- why is this called "linear"?
+       - because its "simple"
+       - optimisation is often possible and fast
+       - interpretable
+       - often / sometimes a reasonable approximation, locally
+- we have seen linear classifiers before: LDA, nearest mean,
+- Linear hypothesis class: all different w
+
+![[Pasted image 20251105144121.png]]
+TODO: CHECK WHAT THE ANSWER FOR THIS WAS SOMEHOW
+(or at least find the answer to the principle of the question - just based on how it's set up I would wager that 1 is False and 2 is True, and I guess I could kinda see why 1 is false seeing as it uses a squared term)
+![[Pasted image 20251105144203.png]]
+(there is also this graph)
+
+![[Pasted image 20251105144311.png]]
+(TODO: I don't entirely understand this image)
+
+
+## what is 'linear' about a classifier?
+the decision function (usually) and the decision boundary
+
+Side Note: Multiple w's can result in the same decision boundary
+
+For our choice for a hypothesis class, let's go with all linear functions:
+![[Pasted image 20251105144526.png]]
+(so basically, as I understand it, this models all linear functions, and we are trying to find the one that gives us the lowest loss, is what the entirety of this formula is)
+![[Pasted image 20251105144615.png]]
+(what this explains)
+
+but how do we define a good loss function?
+
+- how do we measure how 'well' the model solves the problem
+- different problems: regression & classification
+- lets start with regression
+
+## linear regression
+![[Pasted image 20251105144731.png]]
+
+to find out which line is better we have to define a loss function:
+- consider the difference between the predicted value and the true value for example TODO: figure out what the \top symbol means again exactly $$\mathbf{x}_i^\top\mathbf{w}+w_0-y_i$$ (sorry yes I started bolding things now that I should have bolded long ago to indicate they are vectors)
+- we can also consider the absolute difference $$|\mathbf{x}_i^\top\mathbf{w}+w_0-y_i|$$
+- or penalise large differences more strongly by taking the squared difference $$(\mathbf{x}_i^\top\mathbf{w}+w_0-y_i)^2$$
+![[Pasted image 20251105151944.png]]
+![[Pasted image 20251105152000.png]]
+
+![[Pasted image 20251105152032.png]]
+(application of the above)
+
+$$\min_{\mathbf{w},w_0\in\mathbb{R}^{D+1}}\frac{1}{N}\sum_{i=1}^N(\mathbf{x}_i^\top\mathbf{w}+w_0-y_i)^2$$
+Cost function: Squared Loss
+Hypothesis Class: All Linear models (thats the part below the min to be clear)
+Minimisation procedure: Gradient Descent (or closed form solution in practice which may be achieved)
+
+$$\min_{\mathbf{w},w_0\in\mathbb{R}^{D+1}}\frac{1}{N}\sum_{i=1}^N(\mathbf{x}_i^\top\mathbf{w}+w_0-y_i)^2=\min_{\mathbf{w},w_0}J(\mathbf{w},w_0)$$
+How to find the best $\mathbf{w}$?
+- taking the derivative, finding w for which it is 0?
+- trying all possible vals? -> well this basically seems impossible given you want to test all possible functions, but you can probably limit it
+- starting at some w and keep making small changes to improve it? (this is stochastic descent right?)
+
+## Gradient Descent (the thing I meant with Stochastic descent)
+![[Pasted image 20251105152429.png]]
+How do you choose step size and number of steps?
+
+![[Pasted image 20251105152455.png]]
+
+
+### gradient descent procedure
+Given an objective function J, learning rate (step size) $\alpha$ and number of iterations T
+1. Pick a starting value (e.g. random) for $\mathbf{w}$ and $w_0$.
+2. For T iterations, for all $j=0,1,...,D$ do:
+$$w_j^{t+1}=w_j^t-\alpha\left.\frac{\partial J(\mathbf{w},w_0)}{\partial w_j}\right|_{\mathbf{w},w_0=\mathbf{w}^t,w_0^t}$$
+step in the direction of descent essentially, I have an idea of how this works as someone explained it quite well once (i think at least)
+
+![[Pasted image 20251105152915.png]]
+(TODO: unsure how to do this - perhaps check it out yea)
+(as far as I understand this is finding the partial derivative with respect to that point I guess) (TODO: what was w_0 again? the bias no?)
+
+
+![[Pasted image 20251105153029.png]]
+
+## Stochastic Gradient Descent
+![[Pasted image 20251105153138.png]]
+- to calc the full gradient we need to sum over all objects before we take a step
+- instead we could estimate the gradient using one, or a few objects and take a step using this estimate of the gradient
+- the step is less 'precise', but we can take many more steps in the same amount of time (faster)
+- epoch: visiting all the data once
+- so in stochastic gradient descent we do updates within an epoch while regular gradient descent does only one update per epoch
+(that checks out)
+- what again was the thing minimising? it's trying to find the smallest h? or which one exactly TODO
+
+Other possibilities:
+- fixing the step size is not that smart
+       - second order methods: take curvature into account
+       - line search
+       - momentum
+- many other descent procedures, for instance, conjugate gradient
+- note: using this gradient descent iterative procedure was not necessary here! There is a closed form / analytic solution! For our next model this will not be the case
+
+## Does GD lead to a good / the best solution?
+- depends on the cost function and function class: what does the objective function look like?
+- for convex functions with the right settings and the right amount of patience, we can reach the global min
+- the global min is the classifier with the min value for the cost function which may not be the best solution for the problem (as we define the cost function) (recall: we want the solution to work well for the expected loss)
+
+![[Pasted image 20251105153529.png]]
+
+(oh wow thanks aspect ratio)
+
+# Linear Discriminative Classifiers
+![[Pasted image 20251105153632.png]]
+(empirical risk minimisation)
+
+## Logistic Regression?
+
+Classification Loss - how do you want to do it?
+- Count the number of mistakes?
+- Nah - accuracy can be difficult to optimise (if you are on a flat surface where do you go?)
+- multiple good solutions, which one to pick
+![[Pasted image 20251105153851.png]]
+Two reasons to use some other loss functions:
+1. there are other objectives in classification: different weight for different kind of mistakes, accurate probabilities, correct "ordering" of the objects etc.
+2. find a good solution for accuracy by using some other loss quantification
+
+
+*different goal:*
+- find a function that accurately approximates the prob of a label:
+$$h(X)\approx P(Y=1|X)$$
+(in this case h(X) maps the prob that something is class 1 based on the given data, so it maps a probability)
+probs have to be between [0,1]
+Our hypothesis function h(X) can return any continuous value
+Let's use some function to map h(X) to [0,1]
+
+![[Pasted image 20251105154154.png]]
+
+## classification losses: logistic regression
+Assume Y is either 0 or 1.
+Model the class posterior probability $P(Y=1|X)$ using $\sigma(h(X))$
+measure the quality of the model: given a choice of h(X) what is the prob. we would have observed the labels we observed in the data. -> not just what proportion did you observe, but what is the chance you observed it! important distinction I feel like
+
+N Bernoulli trials: The estimated prob. of observing label $y_i$ for object $i$
+$$
+\begin{cases}
+\sigma(h(\mathbf{x}_i)),&\text{if } y_i=1 \\
+1-\sigma(h(\mathbf{x}_i)),&\text{if } y_i=0
+\end{cases}
+$$
+(all my homies love sigmoids)
+
+Combining all of these, the likelihood is:
+$$L(h)=\prod^N_{i=1}\sigma(h(\mathbf{x}_i))^{y_i}(1-\sigma(h(\mathbf{x}_i)))^{1-y_i}$$
+The likelihood function is a measure of how well the estimated probabilities explain the observed labels, so we want to maximise this likelihood (cinema)
+
+This gives the same optimum as minimizing the negative log likelihood, which we could have done as :
+$$J(h)=-\sum^N_{i=1}\Big(y_i\log[\sigma(h(\mathbf{x}_i))]+(1-y_i)\log[1-\sigma(h(\mathbf{x}_i))]\Big)$$
+![[Pasted image 20251105164313.png]]
+
+## Logistic Regression: Hypothesis Class
+for linear logistic regression, we again consider all $\mathbf{w}, w_0$ for $h(\mathbf{x})=\mathbf{x}^\top\mathbf{w}+w_0$
+which gives us this wonderful formula:
+![[Pasted image 20251105164449.png]]
+
+yep
+
+(just combination of all the things from before basically but I wasn't gonna write that out right now)
+
+now if we want to make class predictions we have to use a cut-off at a probability, which corresponds to a threshold for h(x).
+(you can find a visualisation here apparently:
+https://jkrijthe.shinyapps.io/lr-optimization/)
+
+Logistic Regression:
+- linear classifier
+- maximise the (log) likelihood of the observed posterior probabilities
+- or minimize the negative (log) likelihood
+- no analytical solution so we use an iterative process like gradient descent
+- if there is no class overlap our quality measure is not sufficient: many equally good solutions
+
+## support vector machines (TODO: apparently you have to check books for optimisation details)
+![[Pasted image 20251105164828.png]]
+![[Pasted image 20251105164842.png]]
+![[Pasted image 20251105164856.png]]
+https://jkrijthe.shinyapps.io/ClassroomSVM/
+(another example apparently)
+
+## Defining the support vector class? SVC whatever that is
+We want to classify the object as either the positive or the negative class, which we decide based on the sign of $$\mathbf{x}_i^\top\mathbf{w}+w_0$$
+we want the closest objects to be "away" from the decision boundary, so let's say we want a decision value of at least M
+$$
+\begin{cases}
+\mathbf{x}_i^\top\mathbf{w}+w_0 \geq M&\text{if }y_i=+1\\
+\mathbf{x}_i^\top\mathbf{w}+w_0 \leq -M&\text{if }y_i=-1\\
+\end{cases}
+$$
+Problem: we always have the freedom to scale w to reach M, without changing the decision boundary. Let's choose the scale such that closest object has decision value -1 or 1
+$$
+\begin{cases}
+\mathbf{x}_i^\top\mathbf{w}+w_0\geq1&\text{if }y_i=+1\\
+\mathbf{x}_i^\top\mathbf{w}+w_0\leq-1&\text{if }y_i=-1
+\end{cases}
+$$
+
+what is the distance of the decision boundary to the closest object?
+$$\frac{|\mathbf{x}_i^\top\mathbf{w}+w_0|}{||\mathbf{w}||}=\frac{1}{||\mathbf{w}||}$$
+
+for the points on the margin we thus know that their distance to the decision boundary is the above
+
+So the margin is: $$\frac{2}{||\mathbf{w}||}$$
+TODO: please interpret this for my monkey brain because right now it's just math in my head
+
+we wanted to maximise the margin so:$$\max_{\mathbf{w},w_0}\frac{2}{||\mathbf{w}||}$$
+which is the same as minimising: $$\min_{\mathbf{w},w_0}\frac{1}{2}||\mathbf{w}||^2$$
+![[Pasted image 20251105165950.png]]
+(subject to)
+
+![[Pasted image 20251105170031.png]]
+
+Soft-Margin SVM
+$$\min_{\mathbf{w},w_0}\frac{1}{2}||\mathbf{w}||^2+C\sum^N_{i=1}[1-y_i(\mathbf{x}_i^\top\mathbf{w}+w_0)]_+$$
+where $[z]_+=\max(0,z)$: the + subscript at the end means the bracketed value is kept if it is positive, and replaced by 0 otherwise
+
+![[Pasted image 20251105170216.png]]
+always remember this
+
+## SVM as empirical risk minimisation
+![[Pasted image 20251105170249.png]]
+
+SVM summary:
+- linearly separable data: maximise the margin
+- the objects on the margin completely determine what the decision boundary looks like (support vectors)
+- removing the other objects will not change the decision boundary
+- SVMs can also be formulated as a risk minimisation problem
+- extensions: non-separable data, non-linear functions
+
+
+## multi-class classification
+i.e. more than 2 classes in our case
+- for generative models: model the extra $P(X|Y=k)$'s
+- Discriminative Models:
+       - two approaches:
+               - algorithm specific adaptions i.e. directly construct a multi-class classifier
+               - general techniques: combining multiple 2-class classifiers
+
+Example for algorithm adaptions:
+Change the algo to return posteriors / decisions values for K-classes rather than 1
+
+e.g. in logistic regression, instead of modelling the probability of positive vs. negative, we can also model the probability of multiple classes:
+
+Two Class:
+$$
+\begin{align}
+p(Y=1|\mathbf{x})=\frac{1}{1+\exp(-\mathbf{x}^\top\mathbf{w})} \\
+p(Y=0|\mathbf{x})=\frac{\exp(-\mathbf{x}^\top\mathbf{w})}{1+\exp(-\mathbf{x}^\top\mathbf{w})}
+\end{align}
+$$
+(ok wait I think the funky T means essentially here that we just want to multiply all teh features by all the weights and it like rotates the matrix essentially no?, at least that would make sense TODO)
+
+Multiple classes:
+$$p(Y=k|\mathbf{x})=\frac
+{\exp(-\mathbf{x}^\top\mathbf{w}_k)}
+{\sum^K_{j=1}\exp(-\mathbf{x}^\top\mathbf{w}_j)}$$
+
+## general approaches
+sometimes the algorithm might be inherently binary, and hard to adapt to multiple classes. Can we still use these binary classifiers to do multiclass classification? - absolutely
+
+Consider:
+- one vs. rest
+- one vs. one
+methods
+![[Pasted image 20251105174010.png]]
+![[Pasted image 20251105174017.png]]
+
+Solving the ambiguity:
+- possible solution to the ambiguity: use the decision values rather than predicted labels of the classifiers
+- ideally we train these classifiers jointly, to optimize performance on the multiclass problem
+- often we can adapt the original model from 2 class to k class as in the logistic regression example
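+(the ambiguity: with one-vs-rest a point can be claimed by several classifiers or by none, and with one-vs-one the pairwise votes can tie, so the predicted labels alone do not always pick a single class)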
+
+- linear logistic regression with logistic loss and gradient descent
+- linear support vector machines: max the margin
+- multi-class discriminative classifiers:
+       - algorithm specific adaption
+       - generic techniques: one-vs-rest, one-vs-one
+
+## responsible machine learning - fairness
+![[Pasted image 20251105174301.png]]
+problematic use of AI
+
+- demographic disparities: sometimes called "biases"
+- not to be confused with bias in the bias-variance tradeoff
+- for example: "Amazon's free same-day delivery is twice as likely to be offered in a neighbourhood with more white residents compared to black residents"
+- is this discrimination?
+
+why do disparities exist?
+- history of discrimination 
+- implicit attitudes about groups of people
+- stereotypes
+- differences in characteristics in groups
+- stereotypes can be self-fulfilling and are hard to get rid of
+- how to avoid them in ML models?
+
+measurement:
+- downloading an excel sheet, csv
+- scientist measuring in a lab
+- using a sensor
+- objective
+yes or no to that?
+
+## measuring difficulties:
+- Measuring race:
+       - social construct; not related to biology
+       - changes over time
+       - different terms in different cultures e.g. African American
+- Checkbox?
+       - multiracial / mixed?
+       - How to measure?
+- Measuring stuff about people can be subjective and challenging
+
+- machine learners need to make variables, categories or targets
+- Target vars:
+       - 'credit worthiness'
+       - 'good employee'
+       - 'physical attractiveness'
+       - 'criminal risk assessment'
+
+- subjective judgements inherit stereotypes (also unconsciously)
+- social constructs
+
+## learning difficulties
+![[Pasted image 20251105174734.png]]
+![[Pasted image 20251105174752.png]]
+
+### an idea:
+- ml application that judges job applications
+- idea: remove the gender
+
+- discrimination still happens - there are other ways in which society can have an influence on men and women that is identifiable and the AI can find this too, and we can't remove all that information because we might just not be left with anything
+
+CelebA dataset, used for facial recognition
+- celebrities from hollywood
+- little minority data
+- result: model underperforms for minorities (too few training samples)
+
+## action difficulties
+- our model predicts who is suitable for a job
+- training data:
+       - 90% of the pop 1 is suitable for the job
+       - 50% of the pop 2 is suitable for the job
+- the model will reproduce this pattern in the data
+       - that can be bad
+
+- automated decisions
+- only the outcomes are important not the process
+- true or false? (false?)
+
+ethical decisions: not only outcome important, but process
+
+why was a decision made? Only if we can understand how the decision is made, we can understand if it was ethical or not
+
+needs to be explainable / white box
+
+## feedback
+feedback: click on AD, link search result
+feedback to the machine learning system to improve model
+
+- googling a black-sounding name: more ads for arrest records because users click on ads that conform to stereotypes
+
+- is this the feedback we want? clicked because best result or just because top of page?
+
+- feedback can reflect or amplify cultural prejudices
+
+## loops
+- self-fulfilling predictions
+- predictions that affect the training data
+- predictions that affect society
+
+self-fulfilling:
+![[Pasted image 20251105175358.png]]
+classic police example
+
+![[Pasted image 20251105175413.png]]
+
+![[Pasted image 20251105175449.png]]
+
+![[Pasted image 20251105175506.png]]
+
+
+## other considerations:
+- a "fair" learning model might not always be the best solution
+       - maybe try to improve workplace for minorities
+- should we automate and measure everything?
+       - errors with facial recog, DNA forensics etc.
+
+but you can also be optimistic:
+- ML can be more accurate than experts
+- ml can be more transparent
+- forces us to articulate our objectives
+- debate and specify: what is fair and trade-offs?
+- more difficult to hide poorly specified or harmful intentions
+
+## measuring fairness and adapting models
+- why sensitive attributes shouldn't be removed
+- measuring fairness using statistics:
+       - criteria 1: independence
+       - criteria 2: separation
+- limitations of statistics
+
+notation notes:
+$A$ sensitive attribute. Example: $A=a$ (other genders), $A=b$ (male)
+$X$ other features. Example: Word occurrences in resume
+$Y$ classification target. $Y=0$ (job rejection), $Y=1$ (accept)
+$R$ classifier output. $R=0$ (predict: reject), $R=1$ (predict: accept)
+
+![[Pasted image 20251105175851.png]]
+![[Pasted image 20251105175900.png]]
+![[Pasted image 20251105180633.png]]
+
+crit 1: independence
+$$P(R=1|A=a)=P(R=1|A=b)$$ (the chance that either gender is given the job has to be the same, or at least predicted to be given the job rather)
+
+the acceptance rate for each group should be the same.
+the classification rule does not have to be the same for each group
+
+Independence not satisfied here:
+Group 1: 5/100 hired = 5%
+Group 2: 11/60 hired = 18%
+
+independence satisfied:
+Group 1: 18/100 hired = 18%
+Group 2: 11/60 hired = 18%
+
+Note: need sensitive attribute to compute
+never remove the sensitive attribute from your dataset
+
+## independence approach 2: pre-process
+
+- another way to satisfy independence is to pre-process the data
+- the data should be pre-processed in such a way that it becomes impossible to tell from which group a candidate comes
+
+Problems:
+1. How?
+2. you may lose too much info
+
+![[Pasted image 20251105180954.png]]
+
+Problems with pre-processing:
+- bigger datasets even harder - more time, more variation, more outliers and things you have to take into account
+- sensitive attributes are often more intertwined with all data - may need to remove a lot
+- model performance might suffer a lot
+
+- model input: DNA. Output: predict the income (crazy)
+- What will this model learn?
+- A: DNA -> race -> income
+
+- what do we remove from the DNA?
+- for visual data: what should be removed to obscure gender?
+
+to get independence
+approaches:
+- different model per group
+- preprocess data so that we can't distinguish groups
+
+the problem with independence
+
+![[Pasted image 20251105181214.png]]
+
+Criteria 2: separation
+- instead: fraction of mistakes for each group should be the same
+- e+ and e- should be the same for each group 
+- fraction of positives can now be different per group
+
+![[Pasted image 20251105181316.png]]
+
+
+### ROC CURVE WOOOOOOOOOOOOOOOOOO
+![[Pasted image 20251105181342.png]]
+![[Pasted image 20251105181354.png]]
+
+
+![[Pasted image 20251105181429.png]]
+![[Pasted image 20251105181440.png]]
+
+
+![[Pasted image 20251105181448.png]]
+
+## limitations of stats:
+- criteria only look at the statistics. Problem?
+
+- example: we hire from group A using interview score, we hire from group B randomly
+- example: we hire from group A using the highest interview score, we hire from group B with the lowest interview score
+
+
+Conclusion: for fairness we need to know how a decision was made
+stats: can only judge fairness on a group level
+
+
+![[Pasted image 20251105181633.png]]
+
+![[Pasted image 20251105181649.png]]
+
+# decision trees
+TODO: read lots of book stuff to do here
+![[Pasted image 20251105181742.png]]
+
+