From: 2weiEmu Date: Thu, 6 Nov 2025 00:04:46 +0000 (+0100) Subject: ml notes raw mostly done X-Git-Url: https://git.saalbach.dev/?a=commitdiff_plain;h=5bdbd915d1ab7f3e22c1902b4c36d5da9eafb9bb;p=research-obsidian.git ml notes raw mostly done --- diff --git a/.obsidian/workspace.json b/.obsidian/workspace.json index 1ef08bd..3df61f1 100644 --- a/.obsidian/workspace.json +++ b/.obsidian/workspace.json @@ -23,7 +23,7 @@ "sortOrder": "alphabetical", "showSearch": false, "searchQuery": "", - "backlinkCollapsed": false, + "backlinkCollapsed": true, "unlinkedCollapsed": true } }, @@ -177,17 +177,17 @@ }, "active": "3e78d839de43b5f0", "lastOpenFiles": [ - "Pasted image 20251105181742.png", - "Pasted image 20251105181649.png", - "Pasted image 20251105181633.png", - "Pasted image 20251105181448.png", - "Pasted image 20251105181440.png", - "Pasted image 20251105181429.png", - "Pasted image 20251105181354.png", - "Pasted image 20251105181342.png", - "Pasted image 20251105181316.png", - "Pasted image 20251105181214.png", - "Pasted image 20251105180954.png", + "Pasted image 20251106010304.png", + "Pasted image 20251106010018.png", + "Pasted image 20251106010007.png", + "Pasted image 20251106005956.png", + "Pasted image 20251106005919.png", + "Pasted image 20251106005851.png", + "Pasted image 20251106005803.png", + "Pasted image 20251106005742.png", + "Pasted image 20251106005719.png", + "Pasted image 20251106005655.png", + "Pasted image 20251106005628.png", "University/Machine Learning/Full Notes.md", "Untitled 1.md", "some_ideas.md", diff --git a/Pasted image 20251105220430.png b/Pasted image 20251105220430.png new file mode 100644 index 0000000..10cc9e6 Binary files /dev/null and b/Pasted image 20251105220430.png differ diff --git a/Pasted image 20251105220543.png b/Pasted image 20251105220543.png new file mode 100644 index 0000000..39bac1f Binary files /dev/null and b/Pasted image 20251105220543.png differ diff --git a/Pasted image 20251105220626.png b/Pasted image 
20251105220626.png new file mode 100644 index 0000000..9d1d1b7 Binary files /dev/null and b/Pasted image 20251105220626.png differ diff --git a/Pasted image 20251105220649.png b/Pasted image 20251105220649.png new file mode 100644 index 0000000..e2bb3fc Binary files /dev/null and b/Pasted image 20251105220649.png differ diff --git a/Pasted image 20251105220734.png b/Pasted image 20251105220734.png new file mode 100644 index 0000000..a22718f Binary files /dev/null and b/Pasted image 20251105220734.png differ diff --git a/Pasted image 20251105221215.png b/Pasted image 20251105221215.png new file mode 100644 index 0000000..04a8ea0 Binary files /dev/null and b/Pasted image 20251105221215.png differ diff --git a/Pasted image 20251105221710.png b/Pasted image 20251105221710.png new file mode 100644 index 0000000..883db42 Binary files /dev/null and b/Pasted image 20251105221710.png differ diff --git a/Pasted image 20251105221727.png b/Pasted image 20251105221727.png new file mode 100644 index 0000000..fc2c60f Binary files /dev/null and b/Pasted image 20251105221727.png differ diff --git a/Pasted image 20251105221833.png b/Pasted image 20251105221833.png new file mode 100644 index 0000000..8c93069 Binary files /dev/null and b/Pasted image 20251105221833.png differ diff --git a/Pasted image 20251105222032.png b/Pasted image 20251105222032.png new file mode 100644 index 0000000..91cd3c4 Binary files /dev/null and b/Pasted image 20251105222032.png differ diff --git a/Pasted image 20251105222820.png b/Pasted image 20251105222820.png new file mode 100644 index 0000000..4b2a314 Binary files /dev/null and b/Pasted image 20251105222820.png differ diff --git a/Pasted image 20251105224356.png b/Pasted image 20251105224356.png new file mode 100644 index 0000000..537f7a9 Binary files /dev/null and b/Pasted image 20251105224356.png differ diff --git a/Pasted image 20251105225000.png b/Pasted image 20251105225000.png new file mode 100644 index 0000000..57884ac Binary files /dev/null and 
b/Pasted image 20251105225000.png differ diff --git a/Pasted image 20251105225024.png b/Pasted image 20251105225024.png new file mode 100644 index 0000000..3a0962b Binary files /dev/null and b/Pasted image 20251105225024.png differ diff --git a/Pasted image 20251105225036.png b/Pasted image 20251105225036.png new file mode 100644 index 0000000..9325a7e Binary files /dev/null and b/Pasted image 20251105225036.png differ diff --git a/Pasted image 20251105225616.png b/Pasted image 20251105225616.png new file mode 100644 index 0000000..a1ab6b9 Binary files /dev/null and b/Pasted image 20251105225616.png differ diff --git a/Pasted image 20251105225642.png b/Pasted image 20251105225642.png new file mode 100644 index 0000000..5d87285 Binary files /dev/null and b/Pasted image 20251105225642.png differ diff --git a/Pasted image 20251105230143.png b/Pasted image 20251105230143.png new file mode 100644 index 0000000..0bfc7cd Binary files /dev/null and b/Pasted image 20251105230143.png differ diff --git a/Pasted image 20251105230213.png b/Pasted image 20251105230213.png new file mode 100644 index 0000000..98a89a9 Binary files /dev/null and b/Pasted image 20251105230213.png differ diff --git a/Pasted image 20251105230237.png b/Pasted image 20251105230237.png new file mode 100644 index 0000000..a1d702b Binary files /dev/null and b/Pasted image 20251105230237.png differ diff --git a/Pasted image 20251105230254.png b/Pasted image 20251105230254.png new file mode 100644 index 0000000..21a01d7 Binary files /dev/null and b/Pasted image 20251105230254.png differ diff --git a/Pasted image 20251105230314.png b/Pasted image 20251105230314.png new file mode 100644 index 0000000..3ec9b54 Binary files /dev/null and b/Pasted image 20251105230314.png differ diff --git a/Pasted image 20251105230333.png b/Pasted image 20251105230333.png new file mode 100644 index 0000000..4bb1744 Binary files /dev/null and b/Pasted image 20251105230333.png differ diff --git a/Pasted image 20251105230342.png 
b/Pasted image 20251105230342.png new file mode 100644 index 0000000..e533364 Binary files /dev/null and b/Pasted image 20251105230342.png differ diff --git a/Pasted image 20251105230519.png b/Pasted image 20251105230519.png new file mode 100644 index 0000000..0c141c5 Binary files /dev/null and b/Pasted image 20251105230519.png differ diff --git a/Pasted image 20251105230529.png b/Pasted image 20251105230529.png new file mode 100644 index 0000000..662a639 Binary files /dev/null and b/Pasted image 20251105230529.png differ diff --git a/Pasted image 20251105230600.png b/Pasted image 20251105230600.png new file mode 100644 index 0000000..6460c14 Binary files /dev/null and b/Pasted image 20251105230600.png differ diff --git a/Pasted image 20251105230636.png b/Pasted image 20251105230636.png new file mode 100644 index 0000000..d9ac86a Binary files /dev/null and b/Pasted image 20251105230636.png differ diff --git a/Pasted image 20251105230652.png b/Pasted image 20251105230652.png new file mode 100644 index 0000000..8fb990b Binary files /dev/null and b/Pasted image 20251105230652.png differ diff --git a/Pasted image 20251105232759.png b/Pasted image 20251105232759.png new file mode 100644 index 0000000..ad54435 Binary files /dev/null and b/Pasted image 20251105232759.png differ diff --git a/Pasted image 20251105233229.png b/Pasted image 20251105233229.png new file mode 100644 index 0000000..10e62a6 Binary files /dev/null and b/Pasted image 20251105233229.png differ diff --git a/Pasted image 20251105233246.png b/Pasted image 20251105233246.png new file mode 100644 index 0000000..4e5f0a2 Binary files /dev/null and b/Pasted image 20251105233246.png differ diff --git a/Pasted image 20251105233330.png b/Pasted image 20251105233330.png new file mode 100644 index 0000000..cd3505f Binary files /dev/null and b/Pasted image 20251105233330.png differ diff --git a/Pasted image 20251105233501.png b/Pasted image 20251105233501.png new file mode 100644 index 0000000..e675450 Binary 
files /dev/null and b/Pasted image 20251105233501.png differ diff --git a/Pasted image 20251105233510.png b/Pasted image 20251105233510.png new file mode 100644 index 0000000..9e23859 Binary files /dev/null and b/Pasted image 20251105233510.png differ diff --git a/Pasted image 20251105233554.png b/Pasted image 20251105233554.png new file mode 100644 index 0000000..31328cf Binary files /dev/null and b/Pasted image 20251105233554.png differ diff --git a/Pasted image 20251105233604.png b/Pasted image 20251105233604.png new file mode 100644 index 0000000..d91482c Binary files /dev/null and b/Pasted image 20251105233604.png differ diff --git a/Pasted image 20251105233620.png b/Pasted image 20251105233620.png new file mode 100644 index 0000000..2c376ea Binary files /dev/null and b/Pasted image 20251105233620.png differ diff --git a/Pasted image 20251105233825.png b/Pasted image 20251105233825.png new file mode 100644 index 0000000..76cb06a Binary files /dev/null and b/Pasted image 20251105233825.png differ diff --git a/Pasted image 20251105233857.png b/Pasted image 20251105233857.png new file mode 100644 index 0000000..b854b44 Binary files /dev/null and b/Pasted image 20251105233857.png differ diff --git a/Pasted image 20251105233915.png b/Pasted image 20251105233915.png new file mode 100644 index 0000000..6e199f5 Binary files /dev/null and b/Pasted image 20251105233915.png differ diff --git a/Pasted image 20251105233924.png b/Pasted image 20251105233924.png new file mode 100644 index 0000000..221fe8c Binary files /dev/null and b/Pasted image 20251105233924.png differ diff --git a/Pasted image 20251105233943.png b/Pasted image 20251105233943.png new file mode 100644 index 0000000..964fcfb Binary files /dev/null and b/Pasted image 20251105233943.png differ diff --git a/Pasted image 20251105234401.png b/Pasted image 20251105234401.png new file mode 100644 index 0000000..595268d Binary files /dev/null and b/Pasted image 20251105234401.png differ diff --git a/Pasted image 
20251105234436.png b/Pasted image 20251105234436.png new file mode 100644 index 0000000..09db393 Binary files /dev/null and b/Pasted image 20251105234436.png differ diff --git a/Pasted image 20251105234455.png b/Pasted image 20251105234455.png new file mode 100644 index 0000000..43bc919 Binary files /dev/null and b/Pasted image 20251105234455.png differ diff --git a/Pasted image 20251105234627.png b/Pasted image 20251105234627.png new file mode 100644 index 0000000..7f292c6 Binary files /dev/null and b/Pasted image 20251105234627.png differ diff --git a/Pasted image 20251105234949.png b/Pasted image 20251105234949.png new file mode 100644 index 0000000..3f68224 Binary files /dev/null and b/Pasted image 20251105234949.png differ diff --git a/Pasted image 20251105235122.png b/Pasted image 20251105235122.png new file mode 100644 index 0000000..2b5543c Binary files /dev/null and b/Pasted image 20251105235122.png differ diff --git a/Pasted image 20251106000826.png b/Pasted image 20251106000826.png new file mode 100644 index 0000000..5aeac76 Binary files /dev/null and b/Pasted image 20251106000826.png differ diff --git a/Pasted image 20251106000833.png b/Pasted image 20251106000833.png new file mode 100644 index 0000000..23ea9ab Binary files /dev/null and b/Pasted image 20251106000833.png differ diff --git a/Pasted image 20251106000843.png b/Pasted image 20251106000843.png new file mode 100644 index 0000000..4527a42 Binary files /dev/null and b/Pasted image 20251106000843.png differ diff --git a/Pasted image 20251106000902.png b/Pasted image 20251106000902.png new file mode 100644 index 0000000..2e18907 Binary files /dev/null and b/Pasted image 20251106000902.png differ diff --git a/Pasted image 20251106000909.png b/Pasted image 20251106000909.png new file mode 100644 index 0000000..6fbdce9 Binary files /dev/null and b/Pasted image 20251106000909.png differ diff --git a/Pasted image 20251106001045.png b/Pasted image 20251106001045.png new file mode 100644 index 
0000000..776fd9a Binary files /dev/null and b/Pasted image 20251106001045.png differ diff --git a/Pasted image 20251106001107.png b/Pasted image 20251106001107.png new file mode 100644 index 0000000..f178f0d Binary files /dev/null and b/Pasted image 20251106001107.png differ diff --git a/Pasted image 20251106001219.png b/Pasted image 20251106001219.png new file mode 100644 index 0000000..8c5173e Binary files /dev/null and b/Pasted image 20251106001219.png differ diff --git a/Pasted image 20251106001228.png b/Pasted image 20251106001228.png new file mode 100644 index 0000000..0f1db49 Binary files /dev/null and b/Pasted image 20251106001228.png differ diff --git a/Pasted image 20251106001536.png b/Pasted image 20251106001536.png new file mode 100644 index 0000000..2d3cc2d Binary files /dev/null and b/Pasted image 20251106001536.png differ diff --git a/Pasted image 20251106002203.png b/Pasted image 20251106002203.png new file mode 100644 index 0000000..eb6e22d Binary files /dev/null and b/Pasted image 20251106002203.png differ diff --git a/Pasted image 20251106002214.png b/Pasted image 20251106002214.png new file mode 100644 index 0000000..8c30899 Binary files /dev/null and b/Pasted image 20251106002214.png differ diff --git a/Pasted image 20251106002222.png b/Pasted image 20251106002222.png new file mode 100644 index 0000000..36e0cc1 Binary files /dev/null and b/Pasted image 20251106002222.png differ diff --git a/Pasted image 20251106002229.png b/Pasted image 20251106002229.png new file mode 100644 index 0000000..ff48585 Binary files /dev/null and b/Pasted image 20251106002229.png differ diff --git a/Pasted image 20251106002243.png b/Pasted image 20251106002243.png new file mode 100644 index 0000000..1762bb7 Binary files /dev/null and b/Pasted image 20251106002243.png differ diff --git a/Pasted image 20251106002252.png b/Pasted image 20251106002252.png new file mode 100644 index 0000000..add8263 Binary files /dev/null and b/Pasted image 20251106002252.png differ diff 
--git a/Pasted image 20251106002259.png b/Pasted image 20251106002259.png new file mode 100644 index 0000000..2bfc6f1 Binary files /dev/null and b/Pasted image 20251106002259.png differ diff --git a/Pasted image 20251106002306.png b/Pasted image 20251106002306.png new file mode 100644 index 0000000..148c8cc Binary files /dev/null and b/Pasted image 20251106002306.png differ diff --git a/Pasted image 20251106002314.png b/Pasted image 20251106002314.png new file mode 100644 index 0000000..9df7a43 Binary files /dev/null and b/Pasted image 20251106002314.png differ diff --git a/Pasted image 20251106002321.png b/Pasted image 20251106002321.png new file mode 100644 index 0000000..b0cd352 Binary files /dev/null and b/Pasted image 20251106002321.png differ diff --git a/Pasted image 20251106002344.png b/Pasted image 20251106002344.png new file mode 100644 index 0000000..2fbb3e2 Binary files /dev/null and b/Pasted image 20251106002344.png differ diff --git a/Pasted image 20251106002418.png b/Pasted image 20251106002418.png new file mode 100644 index 0000000..f2486a9 Binary files /dev/null and b/Pasted image 20251106002418.png differ diff --git a/Pasted image 20251106002426.png b/Pasted image 20251106002426.png new file mode 100644 index 0000000..3f8632e Binary files /dev/null and b/Pasted image 20251106002426.png differ diff --git a/Pasted image 20251106002435.png b/Pasted image 20251106002435.png new file mode 100644 index 0000000..18d318d Binary files /dev/null and b/Pasted image 20251106002435.png differ diff --git a/Pasted image 20251106002443.png b/Pasted image 20251106002443.png new file mode 100644 index 0000000..eb56cb8 Binary files /dev/null and b/Pasted image 20251106002443.png differ diff --git a/Pasted image 20251106002901.png b/Pasted image 20251106002901.png new file mode 100644 index 0000000..bafb028 Binary files /dev/null and b/Pasted image 20251106002901.png differ diff --git a/Pasted image 20251106003018.png b/Pasted image 20251106003018.png new file mode 
100644 index 0000000..71b8606 Binary files /dev/null and b/Pasted image 20251106003018.png differ diff --git a/Pasted image 20251106003829.png b/Pasted image 20251106003829.png new file mode 100644 index 0000000..48b06a1 Binary files /dev/null and b/Pasted image 20251106003829.png differ diff --git a/Pasted image 20251106003852.png b/Pasted image 20251106003852.png new file mode 100644 index 0000000..c212d56 Binary files /dev/null and b/Pasted image 20251106003852.png differ diff --git a/Pasted image 20251106003900.png b/Pasted image 20251106003900.png new file mode 100644 index 0000000..9081bc4 Binary files /dev/null and b/Pasted image 20251106003900.png differ diff --git a/Pasted image 20251106004052.png b/Pasted image 20251106004052.png new file mode 100644 index 0000000..8641fd7 Binary files /dev/null and b/Pasted image 20251106004052.png differ diff --git a/Pasted image 20251106004149.png b/Pasted image 20251106004149.png new file mode 100644 index 0000000..2010dd2 Binary files /dev/null and b/Pasted image 20251106004149.png differ diff --git a/Pasted image 20251106004202.png b/Pasted image 20251106004202.png new file mode 100644 index 0000000..9a5111f Binary files /dev/null and b/Pasted image 20251106004202.png differ diff --git a/Pasted image 20251106004404.png b/Pasted image 20251106004404.png new file mode 100644 index 0000000..c42ed42 Binary files /dev/null and b/Pasted image 20251106004404.png differ diff --git a/Pasted image 20251106004416.png b/Pasted image 20251106004416.png new file mode 100644 index 0000000..061f0ab Binary files /dev/null and b/Pasted image 20251106004416.png differ diff --git a/Pasted image 20251106004428.png b/Pasted image 20251106004428.png new file mode 100644 index 0000000..dafd90c Binary files /dev/null and b/Pasted image 20251106004428.png differ diff --git a/Pasted image 20251106004446.png b/Pasted image 20251106004446.png new file mode 100644 index 0000000..c828642 Binary files /dev/null and b/Pasted image 
20251106004446.png differ diff --git a/Pasted image 20251106004843.png b/Pasted image 20251106004843.png new file mode 100644 index 0000000..4d73527 Binary files /dev/null and b/Pasted image 20251106004843.png differ diff --git a/Pasted image 20251106005003.png b/Pasted image 20251106005003.png new file mode 100644 index 0000000..e8666ba Binary files /dev/null and b/Pasted image 20251106005003.png differ diff --git a/Pasted image 20251106005249.png b/Pasted image 20251106005249.png new file mode 100644 index 0000000..606db91 Binary files /dev/null and b/Pasted image 20251106005249.png differ diff --git a/Pasted image 20251106005353.png b/Pasted image 20251106005353.png new file mode 100644 index 0000000..8c19249 Binary files /dev/null and b/Pasted image 20251106005353.png differ diff --git a/Pasted image 20251106005526.png b/Pasted image 20251106005526.png new file mode 100644 index 0000000..c531057 Binary files /dev/null and b/Pasted image 20251106005526.png differ diff --git a/Pasted image 20251106005628.png b/Pasted image 20251106005628.png new file mode 100644 index 0000000..7655d60 Binary files /dev/null and b/Pasted image 20251106005628.png differ diff --git a/Pasted image 20251106005655.png b/Pasted image 20251106005655.png new file mode 100644 index 0000000..ac107cd Binary files /dev/null and b/Pasted image 20251106005655.png differ diff --git a/Pasted image 20251106005719.png b/Pasted image 20251106005719.png new file mode 100644 index 0000000..ef6e134 Binary files /dev/null and b/Pasted image 20251106005719.png differ diff --git a/Pasted image 20251106005742.png b/Pasted image 20251106005742.png new file mode 100644 index 0000000..109a42c Binary files /dev/null and b/Pasted image 20251106005742.png differ diff --git a/Pasted image 20251106005803.png b/Pasted image 20251106005803.png new file mode 100644 index 0000000..138f4a4 Binary files /dev/null and b/Pasted image 20251106005803.png differ diff --git a/Pasted image 20251106005851.png b/Pasted image 
20251106005851.png new file mode 100644 index 0000000..ee87c0a Binary files /dev/null and b/Pasted image 20251106005851.png differ diff --git a/Pasted image 20251106005919.png b/Pasted image 20251106005919.png new file mode 100644 index 0000000..cc305de Binary files /dev/null and b/Pasted image 20251106005919.png differ diff --git a/Pasted image 20251106005956.png b/Pasted image 20251106005956.png new file mode 100644 index 0000000..9dd80e0 Binary files /dev/null and b/Pasted image 20251106005956.png differ diff --git a/Pasted image 20251106010007.png b/Pasted image 20251106010007.png new file mode 100644 index 0000000..c2c74d1 Binary files /dev/null and b/Pasted image 20251106010007.png differ diff --git a/Pasted image 20251106010018.png b/Pasted image 20251106010018.png new file mode 100644 index 0000000..7283b16 Binary files /dev/null and b/Pasted image 20251106010018.png differ diff --git a/Pasted image 20251106010304.png b/Pasted image 20251106010304.png new file mode 100644 index 0000000..0f40392 Binary files /dev/null and b/Pasted image 20251106010304.png differ diff --git a/University/Machine Learning/Full Notes.md b/University/Machine Learning/Full Notes.md index ea95c09..62f85b2 100644 --- a/University/Machine Learning/Full Notes.md +++ b/University/Machine Learning/Full Notes.md @@ -1390,3 +1390,800 @@ TODO: read lots of book stuff to do here ![[Pasted image 20251105181742.png]] +![[Pasted image 20251105220430.png]] +- what functions are described by decision trees +- how do we make predictions using a given tree +- how do we learn a tree from data +- what are the advantages & disadvantages of this method? 
+ +![[Pasted image 20251105220543.png]] + +![[Pasted image 20251105220626.png]] +(too lazy to make mermaid diagram here) + +![[Pasted image 20251105220649.png]] + +- split the feature space using one feature at a time, recursively +- the splitting + - forms a tree structure + - partitions the space in "rectangles" +- in each rectangle / leaf, we assign a value + + +![[Pasted image 20251105220734.png]] + +Tree Learning is made of 2 parts +1. partitioning the feature space / learning the structure of the tree +2. Assigning values to each part (first this) + +## Learning: value Assignment +Given 0-1 loss and the given partitioning, what is the optimal value $c_l$ to assign to each leaf $l$? +$$h(\mathbf{x})=\sum_{l\in\text{Leaves}}c_l\mathbf{I}(\mathbf{x}\in l)$$ +what is the $\mathbf{I}$ function here? It is the indicator function: 1 if its argument is true, 0 otherwise. + +$$\min_{c_l}\sum^N_{i=1}\mathbf{I}(h(\mathbf{x}_i)\neq y_i)$$ +(answer: the majority label of all the objects assigned to the leaf is the optimal choice) + +![[Pasted image 20251105221215.png]] + +## learning: splitting +- if all objects in a node belong to one class: we are done, it's a leaf +- If not: find a node-variable combination that increases the quality of the tree the most if we split the node further +- how do you measure quality? + - reduction in error before / after the split? + - not sensitive enough? + +## measure of improvement +S = set of objects in the node +F = feature that we are considering +$|S_V|$ = number of objects in the node with value $V$ for feature $F$ + +$$I(S)-\sum_{V\in\text{Values}(F)}\frac{|S_V|}{|S|}I(S_V)$$ +If I(.) is the misclassification error in a set, this measures how much the 0-1 loss will decrease if we split the node into multiple nodes using feature $F$ +other ways to measure impurity of a node I(.)? 
+ +Splitting Criteria: Choosing I(S) ($p_c$ = fraction of objects in S that belong to class c) +Misclassification: +$$1-\max_c p_c$$ + +Entropy: +$$-\sum_c p_c\log(p_c)$$ + +Gini Index: +$$\sum_c p_c(1-p_c)$$ + +![[Pasted image 20251105221710.png]] +![[Pasted image 20251105221727.png]] +TODO (what the fuck do these graphs mean) + +![[Pasted image 20251105221833.png]] + +## information gain + +$$I(S)-\sum_{V\in\text{Values}(F)}\frac{|S_V|}{|S|}I(S_V)$$ with $$I(S)=-\sum_c p_c\log(p_c)$$ (which is entropy) + +![[Pasted image 20251105222032.png]] + +## splitting criteria: gain ratio +- the information gain is biased towards features with many discrete values (that checks out, when you have just a bunch of categories essentially, you would also be really eager to split on those a lot) +- we may overfit on such features (for instance, someone's social security number) +- one idea by Quinlan is to correct for this using the entropy of the feature, the intrinsic value (IV) +- actually some (many) implementations only use binary splits +$$IG(S)=I(S)-\sum_{V\in\text{Values}(F)}\frac{|S_V|}{|S|}I(S_V)$$ +$$IV(S)=-\sum_{V\in\text{Values}(F)}\frac{|S_V|}{|S|}\log\frac{|S_V|}{|S|}$$ +$$\text{GainRatio}(S)=\frac{IG(S)}{IV(S)}$$ + +Splitting: continuous variables + +extra challenge: we need to find a threshold for the split +consider all thresholds and pick the optimal one +how many thresholds do we need to consider? + +Learning: Stopping +- if we try to fit pure trees (ones where every leaf contains objects of just one single class) - we quickly start overfitting, which makes sense because we also try to fit the outliers really hard + +- criteria: + - min node size + - max tree depth + - min information gain + +- alternative / complementary strat: + - grow and prune + + +## learning: pruning +$$C_\alpha(\text{Tree})=\sum_{l\in\text{Leaves}}|S_l|I(S_l)+\alpha|\text{Leaves}|$$ +Leaves: Leaves in the tree +for each alpha, optimise the tree by removing subtrees +>select optimal alpha using a validation set! 
+ +![[Pasted image 20251105222820.png]] + +## generalisations: + +- Multi-Class: + - take the sum over all classes in the entropy term (should be easy enough) +- Regression + - Squared loss as the loss function + - Leaf value: average outcome + - split criterion: minimise the variance + - Covered in ProbStat course + +Summary: +- non-linear classifier +- choices in learning: + - splitting criterion (information gain, gain ratio, entropy / gini index, etc.) + - stopping (min gain, node size, max depth) + - pruning (remove parts based on performance on validation set) + +## pros and cons + +pros: +- "interpretable" +- automatic feature selection +- easy to incorporate discrete features and missing values +- fast + +cons: +- unstable +- cannot model linear relationships efficiently +- "greedy" + + +## classifier combining: +- multiple classifiers that each make a prediction: perhaps they make different kinds of mistakes +- how do we combine these predictions? +- does it help to combine them? + +![[Pasted image 20251105224356.png]] + +### condorcet's Jury Theorem + +*Question* + When does it make sense to combine decisions? + +*Theorem* + if each voter is correct with probability p > 0.5, adding voters to the jury increases probability of correct majority vote (assuming voting is independent) + +## Combining Strategies +- Fixed Rules + - Hard rules e.g. majority voting + - soft rules: e.g. mean, product $$ +\begin{align} +\max_y\sum_{j=1}^C\frac{1}{C}h_j(y|\mathbf{x})\\ +\max_y\prod_{j=1}^C h_j(y|\mathbf{x}) \\ +\min_{p(y|\mathbf{x})}\sum_jD(h_j(y|\mathbf{x}),p(y|\mathbf{x})) +\end{align} +$$ + +- learned rules + - learn a classifier to output a decision based on the outputs of a set of base classifiers + + +### what to combine +- where do multiple classifiers come from? + - different classifiers on the same dataset? + - the same classifier on different datasets? 
+- Diversity may be helpful + - let's consider an example where we construct "diverse" classifiers using different versions of the same dataset + +## random forests + +- single tree might easily overfit and might be unstable +- combine multiple trees +- should be sufficiently "different" + 1. randomly select features + 2. randomly select samples (with replacement) + +random forest: construct a large number of trees using randomly selected objects and features and combine their decisions + +## bagging (bootstrap aggregation) +![[Pasted image 20251105225000.png]] + +## random subspaces +![[Pasted image 20251105225024.png]] + +## random forest +![[Pasted image 20251105225036.png]] + +## parameters +Trees: +- depth +- number of leaves +- min number of objects per node +- information criterion +- pruning + +Forest: +- number of trees +- size of subspaces considered + +Pros: +- flexible / low bias +- works for many different types of data +- embarrassingly parallel (one hell of an adjective here by the slides) +- produces out-of-bag (OOB) estimates +- (scale) invariant +- (relatively) few hyperparameters + +Cons: +- harder to interpret than single trees +- computationally expensive + + +- decision trees: a greedy space partitioning approach to non-linear classification that leads to an "intuitive" classifier, but typically has high variance +- classifier combining can lead to better predictions by combining multiple diverse classifiers +- random forests are a combination of decision trees constructed using random subsampling of objects and features + +## multi-layer perceptrons +(non-linear discriminative classifiers) + +## today: the common learning algorithm used in ML today +- multi-layer perceptrons / artificial neural networks / feed-forward "deep" neural networks +- connected topics: + - logistic regression + - gradient descent + - classifier combining + +![[Pasted image 20251105225616.png]] + +![[Pasted image 20251105225642.png]] +we sure love this + +## perceptron 
math +$$J(\mathbf{w})=\sum^N_{i=1}\max(-y_i(\mathbf{w}^\top\mathbf{x}_i+w_0),0)$$ +Loss: perceptron loss +function class: linear +optimiser: (stochastic) gradient descent + +## perceptron training +$$\frac{\partial J(\mathbf{w})}{\partial\mathbf{w}}=\sum_{i:\,y_i(\mathbf{w}^\top\mathbf{x}_i+w_0)<0}-y_i\mathbf{x}_i$$ +$$\mathbf{w}^{t+1}=\mathbf{w}^t+\alpha_t\sum_{i:\,y_i((\mathbf{w}^t)^\top\mathbf{x}_i+w_0^t)<0}y_i\mathbf{x}_i$$ +guaranteed to converge if: +1. problem is linearly separable +2. we choose the right, decreasing, learning rate + +## logistic regression +![[Pasted image 20251105230143.png]] + + +![[Pasted image 20251105230213.png]] + +## multi-layer perceptrons +![[Pasted image 20251105230237.png]] +![[Pasted image 20251105230254.png]] +![[Pasted image 20251105230314.png]] + +![[Pasted image 20251105230333.png]] + + +![[Pasted image 20251105230342.png]] + +- confusing naming: MLP as general term +- other activation functions: + - sigmoid (logistic (we had this one earlier)) + - Tanh + - Modern alternatives: ReLU and its variants + +![[Pasted image 20251105230519.png]] + + +![[Pasted image 20251105230529.png]] + + +## prediction: forward pass +Given a model, we can do prediction / inference in a *forward pass* +![[Pasted image 20251105230600.png]] + +## objective function (still J) +![[Pasted image 20251105230636.png]] +(is the objective function in this case, with one hidden layer) +![[Pasted image 20251105230652.png]] + +## why learning is hard +- if the model output is wrong, how should we update the model? which part of the error should we attribute to the different weights? 
+![[Pasted image 20251105232759.png]] + +## learning the MLP weights +- remember the general setup: we are again going to use gradient descent so we need the gradient +- intuitively, what the gradient will do is propagate the error at the end of the network, back across the graph to tell us how to change the weights +- this clever way of efficiently calculating the gradient is called "back propagation" + +$$\mathbf{w}^{t+1}=\mathbf{w}^t-\alpha_t\nabla_{\mathbf{w}}J(\mathbf{w}^t)$$ +the inverted delta $\nabla$ is called "nabla"; it denotes the gradient, i.e. the vector of all partial derivatives +$$\nabla_\mathbf{w}J(\mathbf{w})= +\begin{bmatrix} +\frac{\partial J(\mathbf{w})}{\partial w_1} \\ +\vdots \\ +\frac{\partial J(\mathbf{w})}{\partial w_n} +\end{bmatrix}$$ +## backpropagation +![[Pasted image 20251105233229.png]] +![[Pasted image 20251105233246.png]] + +it is important you understand the intuition here: why this is called back-propagation and how it relates to the gradient + + +![[Pasted image 20251105233330.png]] +(TODO: yo alright you lost me here ngl, im gonna have to pick this one apart) + +![[Pasted image 20251105233501.png]] +![[Pasted image 20251105233510.png]] + +## learning the MLP weights +![[Pasted image 20251105233554.png]] +![[Pasted image 20251105233604.png]] + +![[Pasted image 20251105233620.png]] + +there are many variations: +- activation functions and how to combine them +- how many hidden layers? +- how to connect different neurons, share parameters etc. +- initialisation, optimisation procedure, regularisation, weight decay ... +- these combinations lead to the models that are used today: + - Convolutional neural networks + - residual networks + - transformers + +## challenges +- non-convex risk function: may get stuck in local optima, converge slowly etc. +- many architectural choices: which is optimal? 
+- flexible model: risk of overfitting +- there is an *art* to getting networks to work well +- difficult to interpret the model (black box) +- with many parameters: computationally demanding (but can be partially addressed with hardware) + +![[Pasted image 20251105233825.png]] + +![[Pasted image 20251105233857.png]] + +![[Pasted image 20251105233915.png]] + +![[Pasted image 20251105233924.png]] + +![[Pasted image 20251105233943.png]] + +Pros and cons: + +Pros: +- flexible model class +- good empirical performance on many (structured) problems +- can be easily adapted to different learning settings + +Cons: +- computationally really expensive +- lots of hyper-parameters +- does not converge to a unique optimum +- optimizing them can be an art +- hard to interpret + +recap: +- perceptrons are simple linear binary classifiers +- multi-layer perceptrons are connected architectures of perceptron-like nodes, inspired by a simplified model of the brain +- they are trained using (stochastic) gradient descent, efficiently calculating the gradient using back propagation + +## unsupervised learning +until now we have been using supervised methods +(cinema) +- each training example described by a feature vector and a label + +unlabelled data: what now? +- unsupervised learning: no labels / targets present + +What can you do? 
+- Clustering + - discover structures in unlabelled data +- Dimensionality Reduction + - does not use information about the labels + +## clustering +- explain what clustering is and its applications +- explain k-means algorithm +- explain hierarchical clustering, single and complete link +- pros and cons of k-means and hierarchical clustering +- implement k-means + + +![[Pasted image 20251105234401.png]] + +clustering is about finding natural groups in the data where +- items within the group are close together +- items in different groups are far apart + +subjective: for one a pattern is interesting, for another boring + +![[Pasted image 20251105234436.png]] + +![[Pasted image 20251105234455.png]] + +## clustering applications: +- market research: find groups of similar customers +- social networks: find communities with similar interests / characteristics +- recommender systems: find groups of users with similar ratings + +## what do we need for clustering? +1. proximity measure, either + 1. similarity measure $s(x_i, x_k)$: large if $x_i$ and $x_k$ are similar, or + 2. dissimilarity (distance) measure $d(x_i, x_k)$: small if they are similar +2. Criterion function to evaluate a clustering + 1. ![[Pasted image 20251105234627.png]] +3. algorithm to compute clustering + 1. e.g. by optimizing the criterion function + +## distance measure +- typically we need to define a distance between objects first: +- Euclidean: $$d(\mathbf{x},\mathbf{y})=\sqrt{\sum^l_{i=1}(x_i-y_i)^2}$$ Manhattan: +$$d(\mathbf{x},\mathbf{y})=\sum^l_{i=1}|x_i-y_i|$$ +even more: +- Cosine similarity: $$s_{\text{cos}}(\mathbf{x},\mathbf{y})=\frac{\mathbf{x}^T\mathbf{y}}{\|\mathbf{x}\|\,\|\mathbf{y}\|}$$ +- Pearson's correlation coefficient: +![[Pasted image 20251105234949.png]] + +and more for all kinds of features like discrete, mixed and categorical + +## cluster eval. 
(hard) +- no agreed upon performance measure, many proposed +- intra-cluster cohesion (compactness) + - cohesion measures how near the data points in a cluster are to the cluster's mean + - sum of squared errors (SSE) is a commonly used measure +- inter-cluster separation (isolation): + - separation means that different cluster means should be far away from one another +- in most applications, expert judgments are still the key + +![[Pasted image 20251105235122.png]] + +## Hard vs. Soft +Hard Assignments: each point assigned to 1 cluster +- K-means +- Hierarchical clustering + +- Soft Assignments: each point assigned a cluster membership + - fuzzy c-means + - probabilistic mixture models +(soft is not covered for us) + + +## k-means clustering +K-means algorithm +- let the set of $n$ data points be $\{x_1, x_2, \dots, x_n\}$ (they are all vectors) +- $x_i$ is a feature vector +- $p$ is the number of dimensions + +- the k-means algorithm partitions the given data into k clusters: + - each cluster has a cluster centre (cluster mean), called a centroid + - K is specified by the user + +Given k, the k-means algorithm works as follows: +1. choose k random data points (seeds) to be the initial centroids (cluster centers) +2. Assign each data point to the closest centroid +3. re-compute the centroids using the current cluster memberships +4. If a convergence criterion is not met, repeat steps 2 and 3 + +K-means how it works: +![[Pasted image 20251106000826.png]] +![[Pasted image 20251106000833.png]] +![[Pasted image 20251106000843.png]] +![[Pasted image 20251106000902.png]] +![[Pasted image 20251106000909.png]] + +- when do we know when to stop? +- what is it trying to optimize? +- how do we choose the number of centers? +- are we sure it will terminate? +- are we sure it will find an optimal clustering? 
+ +## K-means convergence (stopping) criterion: +- no (or min) reassignments of data points to different clusters +- no (or min) change of centroids, or +- min decrease in the sum of squared errors (SSE) + +Sum of Squared Errors: +![[Pasted image 20251106001045.png]] +![[Pasted image 20251106001107.png]] +DISTORTION IS THE COST FUNCTION + +## random initialisation +$2\leq k < m$ +- randomly pick K training examples +- set $\mu_1, \mu_2,...,\mu_k$ equal to these examples +local optima: +![[Pasted image 20251106001219.png]] +![[Pasted image 20251106001228.png]] + +k-means summary: +cons: +- finds only convex clusters ("round shapes") +- sensitive to initialisation +- can get stuck in local minima + +pros: +- very simple +- fast + +## hierarchical clustering +- selecting k is a problem of granularity + - how course or fine-grained is the structure in the data? + - no clustering algorithm is able to pick k +- instead of picking k find a hierarchy of structure + - coarse (is what they meant earlier as well): top level contains all points + - fine grained: bottom level one cluster per data point + +![[Pasted image 20251106001536.png]] + +## hierarchical clustering approaches +- agglomerative (bottom-up) + - each point starts as its own cluster + - group two closest clusters + - stop at some point + +- divisive (top-down) + - all points start in one cluster + - split cluster in some "Sensible way" + - stop at some point +(man i love these descriptions this is truly cinema) + +Divisive: hierarchical k-means +- apply k-means recursively: + - run k-means on the original data for k=2 + - for each of the resulting clusters run k-means with k=2 + +agglomerative clustering: +- starting from individual observations, produce sequence of clusterings of increasing size +- at each level, two clusters chosen by criterion are merged + +1. determine distances between all clusters +2. merge clusters that are closest +3. IF no. of clusters > 1 then GOTO 1 + +- which clusters to start with? 
+- what is the distance between clusters? +- final number of clusters? + +## different merging rules +- single linkage: two nearest objects in the clusters: $$g(R,S)=\min_{ij}\{d(x_i,x_j):x_i\in R,x_j\in S\}$$ +- complete linkage: two most remote objects in the clusters: $$g(R,S)=\max_{ij}\{d(x_i,x_j):x_i\in R,x_j\in S\}$$ +- Average linkage: average distance over all pairs of objects in the two clusters: $$g(R,S)=\frac{1}{|R||S|}\sum_{x_i\in R}\sum_{x_j\in S}d(x_i,x_j)$$ +hierarchical clustering: how does it work? + +Input: + - dataset X:[n x p] or directly: + - dissimilarity matrix, D:[n x n] + - linkage type + +output: dendrogram + + +![[Pasted image 20251106002203.png]] + + + +![[Pasted image 20251106002214.png]] +![[Pasted image 20251106002222.png]] +![[Pasted image 20251106002229.png]] +![[Pasted image 20251106002243.png]] +![[Pasted image 20251106002252.png]] +![[Pasted image 20251106002259.png]] +![[Pasted image 20251106002306.png]] +![[Pasted image 20251106002314.png]] +![[Pasted image 20251106002321.png]] +![[Pasted image 20251106002344.png]] + +(ok so this is about finding pairs and not first creating a bunch of pairs, merging them and then all that stuff - no you just start merging the most similar stuff, alright that checks out I was getting a bit confused about that in terms of order) + +![[Pasted image 20251106002418.png]] + + +![[Pasted image 20251106002426.png]] + +![[Pasted image 20251106002435.png]] + +![[Pasted image 20251106002443.png]] + +pros: +- dendrogram gives overview of all possible clusterings +- linkage type allows finding clusters of varying shapes +- different dissimilarity measures can be used + +cons: +- computationally intensive +- clustering limited to "hierarchical nestings" + +https://bost.ocks.org/mike/miserables/ + +http://benjiec.github.io/scatter-matrix/demo/demo.html# + +https://lvdmaaten.github.io/tsne/ + +## clustering summary: +- when we don't have (training) labels: clustering +- definition of clusters is vague and evaluation is hard +- for clustering we need to: + - 
define distance measure + - define criterion function to eval a clustering + - select clustering algo +- algos discussed: + - hierarchical clustering + - k-means clustering + +## dimensionality reduction +- typically, data sets are high-dimensional: each instance is described by many features +- why do we want to reduce data dimensionality? + - make storage or processing of data easier + - (visual) discovery of hidden structure in the data + - remove redundant and noisy features + - intrinsic dimensionality might be smaller +- what does it mean to reduce dimensionality? +- how can we use Principal Component Analysis to reduce dimensionality? + +![[Pasted image 20251106002901.png]] + +### redundant features: +- get a population, predict some property + - instances represented as {urefu, height} pairs + - what is the dimensionality of this data? (2?) + - height = urefu in Swahili + +so yea, cinema about that +![[Pasted image 20251106003018.png]] + + +- data points from different geographic areas over time: + - X1: # of skidding accidents + - X2: # of burst water pipes + - X3: # of snow plow expenditures + - X4: # of forest fires + - X5: # of patients with heat stroke + +... ok, but could we maybe just use temperature here? + +![[Pasted image 20251106003829.png]] + +![[Pasted image 20251106003852.png]] + +![[Pasted image 20251106003900.png]] + +typically: data sets are *high-dimensional*: each instance is described by many features + +## what does it mean to reduce dimensionality? 
+ +## reducing dimensionality: methods +- feature selection + - pick a subset of the original dimensions + - use domain knowledge + - use statistics-based selection methods +- feature extraction + - construct a new set of dimensions $E_i=f(X_1,\dots,X_d)$ ![[Pasted image 20251106004052.png]] + - (linear) combinations of the original dimensions + - whilst *preserving the structure* in the original data + +## feature extraction +- many important dimensionality reduction techniques are linear techniques +- these project the data onto a linear subspace of lower dimensionality (e.g. Principal Components Analysis) + +![[Pasted image 20251106004149.png]] + + +![[Pasted image 20251106004202.png]] +(thank you) + + +## using principal components analysis (PCA) +- PCA maps the data onto a linear subspace such that the variance of the projected data is maximized + +- defines a set of principal components + - 1st: direction of the greatest variability in the data + - 2nd: perpendicular to 1st, greatest variability of what's left + - ... 
and so on until d (original dimensionality) +- first m components become m new dimensions + - change coordinates of every data point to these dimensions + + +![[Pasted image 20251106004404.png]] + +![[Pasted image 20251106004416.png]] +![[Pasted image 20251106004428.png]] + +![[Pasted image 20251106004446.png]] + +## PCA optimisation problem +- PCA maps the data onto a linear subspace such that the variance of the projected data is maximized +- so PCA performs the maximisation $$\max_{\|w\|^2=1}\text{var}(w^TX)$$ +- Constraint: $w^Tw=1\rightarrow\|w\|^2=1$ + - requires w (direction of projection) to be a unit vector (length is 1); solves scaling problem and guarantees unique solution + +## zero-mean data +- recall the definition of variance ![[Pasted image 20251106004843.png]] +- for PCA shift your data to zero-mean + - for each feature, calculate the mean, subtract the mean from each data point in that feature +- helps with: + - the mean does not affect the calculation of variance + - simplifies covariance matrix: $M=\frac{1}{n}XX^T$ + +![[Pasted image 20251106005003.png]] + +- our objective is to max. variance $$\max_{\|w\|^2=1}\text{var}(w^TX)$$ +- assuming zero-mean data: $$\text{var}(w^TX)=\frac{1}{n}(w^TX)(w^TX)^T=\frac{1}{n}w^TXX^Tw=w^TMw$$ +## lagrange multiplier +- introduce a Lagrange multiplier $\lambda$ to incorporate the constraint into our optimization $$L(w,\lambda)=w^TMw-\lambda(w^Tw-1)$$ +![[Pasted image 20251106005249.png]] +- it penalizes any deviation from the constraint + - if the constraint is violated (i.e. $w^Tw\neq 1$) it will either increase or decrease the value of the Lagrangian + +PCA opt. 
problem +- enforce constraint using Lagrange multiplier +- ![[Pasted image 20251106005353.png]] +- set gradient with respect to $\mathbf{w}$ to zero: $$ +\begin{align} +\frac{\partial}{\partial w}\left(w^TMw-\lambda(w^Tw-1)\right)=0\\ +2Mw-2\lambda w=0\\ +Mw=\lambda w +\end{align}$$ +![[Pasted image 20251106005526.png]] + +principal components are given by the eigenvectors of the covariance matrix + +first principal component is given by the eigenvector corresponding to the highest eigenvalue etc. + +![[Pasted image 20251106005628.png]] + +![[Pasted image 20251106005655.png]] + +![[Pasted image 20251106005719.png]] +(honestly I don't get this because I don't get eigenpairs, eigenvectors and eigenvalues entirely, so I will have to look at this) TODO + +![[Pasted image 20251106005742.png]] + +![[Pasted image 20251106005803.png]] +(though this overview at least makes some sense to me so that is good) + +(there is an example on the slides as well, check that out perhaps) + +![[Pasted image 20251106005851.png]] + +![[Pasted image 20251106005919.png]] +![[Pasted image 20251106005956.png]] + +![[Pasted image 20251106010007.png]] + +![[Pasted image 20251106010018.png]] + +PCA has some practical issues: +- covariance extremely sensitive to large values + - multiply some dimensions by 1000 + - dominates covariances + - becomes a principal component + - normalize each dimension to zero mean and unit variance (formula was somewhere above) +- PCA assumes underlying subspace is linear + - 1d: straight line, 2d: plane + +## PCA and classification +- PCA is unsupervised + - max. 
the overall variance of the data along a small set of directions + - does not know anything about class labels + - can pick direction that makes it hard to separate classes +- discriminative approach + - look for a dimension that makes it easy to separate classes + +pros: +- reflects our intuitions about the data +- dramatic reduction in size of data + - faster processing (as long as reduction is fast), smaller storage + +Cons: +- too expensive for many applications +- need to understand the assumptions behind the methods (linearity, etc.) + + +![[Pasted image 20251106010304.png]] + +recap: +- dimensionality reduction builds a condensed data representation +- this removes redundant or noisy features and identifies correlations +- PCA projects data onto the principal eigenvectors of the covariance matrix: max. variance of the projection + +http://setosa.io/ev/principal-component-analysis/ + +http://peterbloem.nl/blog/pca + +(there is something about Convolutional neural networks but it seems to be optional - I think I went to this last time actually but hey) \ No newline at end of file