diff --git a/.gitignore b/.gitignore index 0ce0489..0e00769 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ */target */key */key_debug -*/*sqlite* +*/*sqlite +*/*sqlite-shm +*/*sqlite-wal */deploy*.sh anti_nft_spam_bot/proxies.txt .cache diff --git a/Cargo.lock b/Cargo.lock index 00f720e..c34abe2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,26 +2,11 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "addr2line" -version = "0.25.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler2" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" - [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -48,9 +33,13 @@ dependencies = [ "arch_bot_commons", "chrono", "crc32fast", + "flume", + "futures-util", + "hex", "html-escape", "log", "native-tls", + "percent-encoding", "reqwest", "sqlx", "teloxide", @@ -72,6 +61,15 @@ dependencies = [ "syn", ] +[[package]] +name = "ar_archive_writer" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +dependencies = [ + "object", +] + [[package]] name = "arch_bot_commons" version = "0.6.6" @@ -105,21 +103,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "backtrace" -version = "0.3.76" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-link 0.2.0", -] - [[package]] name = "base64" version = "0.22.1" @@ -154,11 +137,11 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.9.4" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -178,9 +161,9 @@ checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" [[package]] name = "bytemuck" -version = "1.23.2" +version = "1.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677" +checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" [[package]] name = "byteorder" @@ -190,15 +173,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" [[package]] name = "cc" -version = "1.2.39" +version = "1.2.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1354349954c6fc9cb0deab020f27f783cf0b604e8bb754dc4658ecf0d29c35f" +checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36" dependencies = [ "find-msvc-tools", "shlex", @@ -215,9 +198,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chrono" @@ -230,7 +213,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -362,9 +345,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crypto-common" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", @@ -418,9 +401,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", "serde_core", @@ -546,7 +529,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.1", + "windows-sys 0.61.2", ] [[package]] @@ -579,9 +562,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "find-msvc-tools" -version = "0.1.2" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959" +checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] name = "flume" @@ -591,6 +574,7 @@ checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" dependencies = [ "futures-core", 
"futures-sink", + "nanorand", "spin", ] @@ -747,28 +731,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", + "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", + "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.7+wasi-0.2.4", + "wasip2", ] -[[package]] -name = "gimli" -version = "0.32.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" - [[package]] name = "glob" version = "0.3.3" @@ -787,7 +767,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap 2.11.4", + "indexmap 2.12.0", "slab", "tokio", "tokio-util", @@ -879,11 +859,11 @@ dependencies = [ [[package]] name = "home" -version = "0.5.11" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -943,9 +923,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" dependencies = [ "atomic-waker", "bytes", @@ -997,9 +977,9 @@ dependencies = [ [[package]] name = "hyper-util" 
-version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" +checksum = "52e9a2a24dc5c6821e71a7030e1e14b7b632acac55c40e9d2e082c621261bb56" dependencies = [ "base64", "bytes", @@ -1047,9 +1027,9 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", @@ -1060,9 +1040,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", "litemap", @@ -1073,11 +1053,10 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ - "displaydoc", "icu_collections", "icu_normalizer_data", "icu_properties", @@ -1088,42 +1067,38 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" dependencies = [ - "displaydoc", "icu_collections", "icu_locale_core", "icu_properties_data", "icu_provider", - "potential_utf", "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" +checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" [[package]] name = "icu_provider" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", - "stable_deref_trait", - "tinystr", "writeable", "yoke", "zerofrom", @@ -1190,9 +1165,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.11.4" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" dependencies = [ "equivalent", "hashbrown 0.16.0", @@ -1200,17 +1175,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "io-uring" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" -dependencies = [ - "bitflags", - "cfg-if", - "libc", -] - [[package]] name = "ipnet" version = "2.11.0" @@ -1219,9 +1183,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397" dependencies = [ "memchr", "serde", @@ -1229,13 +1193,13 @@ dependencies = [ [[package]] name = "is-terminal" -version = "0.4.16" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1264,9 +1228,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "js-sys" -version = "0.3.81" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ "once_cell", "wasm-bindgen", @@ -1283,9 +1247,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.176" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libloading" @@ -1294,7 +1258,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ "cfg-if", - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -1333,17 +1297,16 @@ checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" 
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "lock_api" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "autocfg", "scopeguard", ] @@ -1403,23 +1366,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] -name = "miniz_oxide" -version = "0.8.9" +name = "mio" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" dependencies = [ - "adler2", + "libc", + "wasi", + "windows-sys 0.61.2", ] [[package]] -name = "mio" -version = "1.0.4" +name = "nanorand" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" dependencies = [ - "libc", - "wasi 0.11.1+wasi-snapshot-preview1", - "windows-sys 0.59.0", + "getrandom 0.2.16", ] [[package]] @@ -1451,11 +1414,10 @@ dependencies = [ [[package]] name = "num-bigint-dig" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" dependencies = [ - "byteorder", "lazy_static", "libm", "num-integer", @@ -1504,9 +1466,9 @@ dependencies = [ [[package]] name = "object" -version = "0.37.3" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] @@ -1519,9 +1481,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "openssl" -version = "0.10.73" +version = "0.10.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" dependencies = [ "bitflags", "cfg-if", @@ -1551,18 +1513,18 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-src" -version = "300.5.2+3.5.2" +version = "300.5.4+3.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d270b79e2926f5150189d475bc7e9d2c69f9c4697b185fa917d5a32b792d21b4" +checksum = "a507b3792995dae9b0df8a1c1e3771e8418b7c2d9f0baeba32e6fe8b06c7cb72" dependencies = [ "cc", ] [[package]] name = "openssl-sys" -version = "0.9.109" +version = "0.9.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" dependencies = [ "cc", "libc", @@ -1579,9 +1541,9 @@ checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" [[package]] name = "parking_lot" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", "parking_lot_core", @@ -1589,15 +1551,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.11" +version = "0.9.12" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -1676,9 +1638,9 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "potential_utf" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ "zerovec", ] @@ -1741,27 +1703,28 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] [[package]] name = "psm" -version = "0.1.26" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" dependencies = [ + "ar_archive_writer", "cc", ] [[package]] name = "quote" -version = "1.0.40" +version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" dependencies = [ "proc-macro2", ] @@ -1828,7 +1791,7 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ 
- "getrandom 0.3.3", + "getrandom 0.3.4", ] [[package]] @@ -1862,27 +1825,27 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.17" +version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ "bitflags", ] [[package]] name = "ref-cast" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ "ref-cast-impl", ] [[package]] name = "ref-cast-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", @@ -1891,9 +1854,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.3" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -1903,9 +1866,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -1914,15 +1877,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "reqwest" -version = "0.12.23" +version = "0.12.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" dependencies = [ "base64", "bytes", @@ -1987,9 +1950,9 @@ dependencies = [ [[package]] name = "rsa" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78928ac1ed176a5ca1d17e578a1825f3d81ca54cf41053a592584b020cfd691b" +checksum = "40a0376c50d0358279d9d643e4bf7b7be212f1f4ff1da9070a7b54d22ef75c88" dependencies = [ "const-oid", "digest", @@ -2005,12 +1968,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustc-demangle" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -2027,14 +1984,14 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.1", + "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.23.32" +version = "0.23.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" dependencies = [ "once_cell", "ring", @@ -2046,18 +2003,18 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +checksum = 
"94182ad936a0c91c324cd46c6511b9510ed16af436d7b5bab34beab0afd55f7a" dependencies = [ "zeroize", ] [[package]] name = "rustls-webpki" -version = "0.103.6" +version = "0.103.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8572f3c2cb9934231157b45499fc41e1f58c589fdfb81a844ba873265e80f8eb" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" dependencies = [ "ring", "rustls-pki-types", @@ -2082,7 +2039,7 @@ version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ - "windows-sys 0.61.1", + "windows-sys 0.61.2", ] [[package]] @@ -2099,9 +2056,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +checksum = "9558e172d4e8533736ba97870c4b2cd63f84b382a3d6eb063da41b91cce17289" dependencies = [ "dyn-clone", "ref-cast", @@ -2195,19 +2152,18 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.14.1" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c522100790450cf78eeac1507263d0a350d4d5b30df0c8e1fe051a10c22b376e" +checksum = "10574371d41b0d9b2cff89418eda27da52bcaff2cc8741db26382a77c29131f1" dependencies = [ "base64", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.11.4", + "indexmap 2.12.0", "schemars 0.9.0", - "schemars 1.0.4", - "serde", - "serde_derive", + "schemars 1.1.0", + "serde_core", "serde_json", "serde_with_macros", "time", @@ -2215,9 +2171,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.14.1" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327ada00f7d64abaac1e55a6911e90cf665aa051b9a561c7006c157f4633135e" +checksum = 
"08a72d8216842fdd57820dc78d840bef99248e35fb2554ff923319e60f2d686b" dependencies = [ "darling", "proc-macro2", @@ -2289,12 +2245,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -2348,7 +2304,7 @@ dependencies = [ "futures-util", "hashbrown 0.15.5", "hashlink", - "indexmap 2.11.4", + "indexmap 2.12.0", "log", "memchr", "once_cell", @@ -2512,15 +2468,15 @@ dependencies = [ [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" dependencies = [ "cc", "cfg-if", @@ -2554,9 +2510,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.106" +version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" dependencies = [ "proc-macro2", "quote", @@ -2706,10 +2662,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - "getrandom 
0.3.3", + "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.61.1", + "windows-sys 0.61.2", ] [[package]] @@ -2723,18 +2679,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", @@ -2774,9 +2730,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -2799,29 +2755,26 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.47.1" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "backtrace", "bytes", - "io-uring", "libc", "mio", "parking_lot", "pin-project-lite", "signal-hook-registry", - "slab", "socket2", "tokio-macros", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", @@ -2861,9 +2814,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.16" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ "bytes", "futures-core", @@ -2957,9 +2910,9 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unicase" @@ -2975,24 +2928,24 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-normalization" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" dependencies = [ "tinyvec", ] [[package]] name = "unicode-properties" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +checksum = 
"7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" @@ -3042,7 +2995,7 @@ version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "js-sys", "wasm-bindgen", ] @@ -3074,15 +3027,6 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" -[[package]] -name = "wasi" -version = "0.14.7+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" -dependencies = [ - "wasip2", -] - [[package]] name = "wasip2" version = "1.0.1+wasi-0.2.4" @@ -3100,9 +3044,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", @@ -3111,25 +3055,11 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.104" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - [[package]] name = "wasm-bindgen-futures" -version = "0.4.54" +version = "0.4.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +checksum = 
"551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" dependencies = [ "cfg-if", "js-sys", @@ -3140,9 +3070,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3150,22 +3080,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] @@ -3185,9 +3115,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.81" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -3199,14 +3129,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.2", + "webpki-roots 1.0.4", ] [[package]] name = "webpki-roots" -version = "1.0.2" 
+version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e8983c3ab33d6fb807cfcdad2491c4ea8cbc8ed839181c7dfd9c67c83e261b2" +checksum = "b2878ef029c47c6e8cf779119f20fcf52bde7ad42a731b2a304bc221df17571e" dependencies = [ "rustls-pki-types", ] @@ -3227,27 +3157,27 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.1", + "windows-sys 0.61.2", ] [[package]] name = "windows-core" -version = "0.62.1" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6844ee5416b285084d3d3fffd743b925a6c9385455f64f6d4fa3031c4c2749a9" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ "windows-implement", "windows-interface", - "windows-link 0.2.0", - "windows-result 0.4.0", - "windows-strings 0.5.0", + "windows-link", + "windows-result", + "windows-strings", ] [[package]] name = "windows-implement" -version = "0.60.1" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edb307e42a74fb6de9bf3a02d9712678b22399c87e6fa869d6dfcd8c1b7754e0" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", @@ -3256,9 +3186,9 @@ dependencies = [ [[package]] name = "windows-interface" -version = "0.59.2" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0abd1ddbc6964ac14db11c7213d6532ef34bd9aa042c2e5935f59d7908b46a5" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", @@ -3267,61 +3197,37 @@ dependencies = [ [[package]] name = "windows-link" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" - 
-[[package]] -name = "windows-link" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-registry" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" -dependencies = [ - "windows-link 0.1.3", - "windows-result 0.3.4", - "windows-strings 0.4.2", -] - -[[package]] -name = "windows-result" -version = "0.3.4" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" dependencies = [ - "windows-link 0.1.3", + "windows-link", + "windows-result", + "windows-strings", ] [[package]] name = "windows-result" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" -dependencies = [ - "windows-link 0.2.0", -] - -[[package]] -name = "windows-strings" -version = "0.4.2" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ - "windows-link 0.1.3", + "windows-link", ] [[package]] name = "windows-strings" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -3353,11 +3259,20 
@@ dependencies = [ [[package]] name = "windows-sys" -version = "0.61.1" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -3384,13 +3299,30 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -3403,6 +3335,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -3415,6 
+3353,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -3427,12 +3371,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -3445,6 +3401,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -3457,6 +3419,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = 
"0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -3469,6 +3437,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -3481,6 +3455,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + [[package]] name = "wit-bindgen" version = "0.46.0" @@ -3489,17 +3469,16 @@ checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "writeable" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] name = "yoke" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ - "serde", "stable_deref_trait", "yoke-derive", "zerofrom", @@ -3507,9 +3486,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.1" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", @@ -3560,15 +3539,15 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -3577,9 +3556,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -3588,9 +3567,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", diff --git a/anti_nft_spam_bot/Cargo.toml b/anti_nft_spam_bot/Cargo.toml index 35c3384..fe60149 100644 --- a/anti_nft_spam_bot/Cargo.toml +++ b/anti_nft_spam_bot/Cargo.toml @@ -9,11 +9,15 @@ edition = "2021" arch_bot_commons = { version = "0.6.5", path = "../arch_bot_commons" } chrono = "0.4.34" crc32fast = "1.4.2" +flume = "0.11.1" 
+futures-util = "0.3.31" +hex = "0.4.3" html-escape = "0.2.13" log = "0.4.17" # This seems to depend on OpenSSL 3.3.0, but Fedora Server 40 only has 3.2.1. # Use "vendored" feature to work around that lmao native-tls = { version = "0.2.12", features = ["vendored"] } +percent-encoding = "2.3.2" reqwest = { version = "0.12.22", features = ["socks"] } sqlx = { version = "0.8.2", features = [ "sqlite", diff --git a/anti_nft_spam_bot/sqlite3_expert.py b/anti_nft_spam_bot/sqlite3_expert.py new file mode 100755 index 0000000..15e4b0c --- /dev/null +++ b/anti_nft_spam_bot/sqlite3_expert.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +# Really messy and hacky, but whatever lol + +import re +import subprocess +import os + +with open("src/database/mod.rs", "r") as f: + data = f.read() + + +extract_queries = re.compile(r"sqlx::query\(\s*\"([\w\W]*?)\"", re.MULTILINE); + +for match in extract_queries.finditer(data): + query = match.group(1) + if "CREATE TABLE" in query: + continue + + if not query.endswith(';'): + query += ";" + + query += "\n" + + print(query) + + subprocess.call(["sqlite3_expert", "-sql", query, "anti_nft_spam_bot.sqlite"]) + diff --git a/anti_nft_spam_bot/src/actions.rs b/anti_nft_spam_bot/src/actions.rs new file mode 100644 index 0000000..302e1ce --- /dev/null +++ b/anti_nft_spam_bot/src/actions.rs @@ -0,0 +1,588 @@ +use arch_bot_commons::{teloxide_retry, useful_methods::BotArchSendMsg}; +use futures_util::TryStreamExt; +use html_escape::encode_text; +use teloxide::{ + payloads::{EditMessageTextSetters, SendMessageSetters}, + prelude::Requester, + sugar::request::{RequestLinkPreviewExt, RequestReplyExt}, + types::{ChatId, InlineKeyboardMarkup, MediaGroupId, Message, MessageId, User}, + ApiError, Bot, RequestError, +}; +use url::Url; + +use crate::{ + database::{Database, InsertOrUpdateResult, UrlInfoFull}, + misc::{chat_name_prettyprint, sender_name_prettyprint, user_name_prettyprint}, + sanitized_url::SanitizedUrl, + types::{MessageDeleteReason, 
ReviewCallbackData, UrlDesignation}, + CONTROL_CHAT_ID, REVIEW_LOG_CHANNEL_ID, +}; + +/// Check if this user is in the control chat and can do reviews, and +/// delay their requests if appropriate. +pub async fn authenticate_control(bot: &Bot, user: &User) -> Result<bool, RequestError> { + let control = bot + .get_chat_member(CONTROL_CHAT_ID, user.id) + .await? + .is_present(); + if !control { + let username = user_name_prettyprint(user, true); + log::info!("Unauthorized user trying to access reviews: {username}"); + + // Not a member. + // + // Now, facts: + // 1. This function will only be run in context of a private chat. + // + // 2. Teloxide intentionally processes messages from one chat not-concurrently; that is, if + // we delay now, this will delay processing all following direct messages sent by that + // person to this bot. + // + // 3. There is no pertinent reason to DM this bot other than to get the help message or for + // authenticated user's purposes. + // + // 4. If a user is sending DMs to this bot, that means that they have already sent + // `/start`, and hence have already seen the help message. + // + // 5. Therefore, there is no harm to be done by delaying users not eligible for reviews for + // DMs. + // + // 6. Bad actors may want to try and spam this bot `/review` to cause it to send the above + // API request many times and in turn get rate limited by telegram. + // + // With that in mind, delay this user from accessing this bot for 5 seconds. + tokio::time::sleep(std::time::Duration::from_secs(5)).await; + } + Ok(control) +} + +/// Convenience function over [`authenticate_control`]. 
+pub async fn authenticate_control_of_sender( + bot: &Bot, + message: &Message, +) -> Result<bool, RequestError> { + if message.chat.id == CONTROL_CHAT_ID { + return Ok(true); + } + + let Some(sender) = &message.from else { + return Ok(false); + }; + + authenticate_control(bot, sender).await +} + +/// Delete the message and maybe say in the chat that it was deleted, if they don't have +/// `/hide_deletes` enabled. +/// +/// # Panics +/// +/// Panics if the database dies lol +pub async fn delete_message_as_spam_raw( + bot: &Bot, + database: &Database, + chat_id: ChatId, + message_id: MessageId, + album_id: Option<&MediaGroupId>, + sender_name: &str, + reason: MessageDeleteReason, +) -> Result<(), RequestError> { + match teloxide_retry!(bot.delete_message(chat_id, message_id).await) { + Ok(_) => { + if let Some(album_id) = album_id { + // Not *too* important if this fails lol + let _ = database.set_last_deleted_album_id(chat_id, album_id).await; + } + + // Now we shall notify. Should we show the reason? + if let Some(reason) = reason.to_str() { + // We should. Do we *need* to? + let deletes_hidden = !database + .get_hide_deletes(chat_id) + .await + .expect("Database died!"); + + if !deletes_hidden { + // We should. + + // Message or album? + let deleted_thing_type = if album_id.is_some() { + "an album" + } else { + "a message" + }; + + bot.archsendmsg_no_link_preview( + chat_id, + format!( + "Deleted {} from {} {}.", + deleted_thing_type, + encode_text(sender_name), + reason + ) + .as_str(), + None, + ) + .await?; + } + } + Ok(()) + } + Err(RequestError::Api(ApiError::MessageIdInvalid | ApiError::MessageToDeleteNotFound)) => { + // Someone else probably has already deleted it. That's fine. + Ok(()) + } + + Err(RequestError::Api(ApiError::MessageCantBeDeleted)) => { + // No rights? Older than 48 hours? + bot.archsendmsg_no_link_preview( + chat_id, + concat!( + "Tried to delete a spam message, but failed. 
", + "This might be because this bot is not an admin with ability to ", + "delete messages, or the message is older than 48 hours.", + ), + None, + ) + .await?; + Ok(()) + } + Err(e) => Err(e), + } +} + +/// Convenience function around [`delete_message_as_spam_raw`]. +pub async fn delete_message_as_spam( + bot: &Bot, + database: &Database, + message: &Message, + sender_name: Option<&str>, + reason: MessageDeleteReason, +) -> Result<(), RequestError> { + let mut tmp = String::new(); + let sender_name = sender_name.unwrap_or_else(|| { + tmp = sender_name_prettyprint(message, false); + &tmp + }); + + delete_message_as_spam_raw( + bot, + database, + message.chat.id, + message.id, + message.media_group_id(), + sender_name, + reason, + ) + .await +} + +/// Inserts/updates this URL in the database with incoming info, logs if necessary, and removes +/// entries from the review queue. +/// +/// Set `reviewer_name` to [`None`] to indicate that this review has been done automatically by the +/// spam checker. +/// +/// If the result is [`InsertOrUpdateResult::NoChange`], no logging is done, but entries in review +/// queue will still be deleted. +pub async fn insert_or_update_url_with_log( + bot: &Bot, + database: &Database, + reviewer_name: Option<&str>, + sanitized_url: &SanitizedUrl, + original_url: &Url, + designation: UrlDesignation, +) -> Result<InsertOrUpdateResult, RequestError> { + let manually_reviewed = reviewer_name.is_some(); + let reviewer_name = reviewer_name.unwrap_or("[AUTO] Spam checker"); + + let result = database + .insert_or_update_url(sanitized_url, original_url, designation, manually_reviewed) + .await + .expect("Database died!"); + + if let InsertOrUpdateResult::NoChange { .. } = result { + // We did nothing. Not worth logging. + return Ok(result); + } + + // Change was enacted. Log it. 
+ + let command = match designation { + UrlDesignation::Spam => "/mark_spam", + UrlDesignation::NotSpam => "/mark_not_spam", + UrlDesignation::Aggregator => "/mark_aggregator", + }; + + let mut log_message = format!("{reviewer_name}\n{command}\n{sanitized_url}"); + if sanitized_url.as_str() != original_url.as_str() { + log_message.push_str("\n\nOriginal URL: "); + log_message.push_str(original_url.as_str()); + } + + // Now to discard URLs on review matching this new rule. + // This requires also discarding all keyboards and sightings pertaining it. + + while let Some(review_entry_id) = database + .find_one_matching_review_queue_entry(result.id()) + .await + .expect("Database died!") + { + // After deleting keyboards/sightings but before deleting the review URL, + // someone might add a new keyboard/sighting, and fail deleting it. + // So, basically, retry until we get it lol + loop { + // Discard review keyboards. + let mut keyboards = database.pop_review_keyboards(review_entry_id); + while let Some((chat_id, message_id)) = + keyboards.try_next().await.expect("Database died!") + { + // No biggie if this fails. + let _ = discard_review_keyboard( + bot, + chat_id, + message_id, + reviewer_name, + designation, + sanitized_url, + ) + .await; + } + + let mut sightings = database.pop_review_link_sightings(review_entry_id); + while let Some((chat_id, message_id, sender_name)) = + sightings.try_next().await.expect("Database died!") + { + if designation == UrlDesignation::Spam { + // Not *really* a biggie if this fails. + let _ = delete_message_as_spam_raw( + bot, + database, + chat_id, + message_id, + None, + &sender_name, + MessageDeleteReason::ContainsSpamLink, + ) + .await; + } + } + + // And now try getting rid of it. + match database.delete_from_review(review_entry_id).await { + Err(sqlx::Error::Database(e)) + if e.kind() == sqlx::error::ErrorKind::ForeignKeyViolation => + { + // Oops! New review/keyboard was made. + // Loop over and remove it. 
+ } + Ok(()) => break, + Err(e) => panic!("Database died!: {e:?}"), + } + } + } + + bot.archsendmsg_no_link_preview(REVIEW_LOG_CHANNEL_ID, log_message.as_str(), None) + .await?; + + Ok(result) +} + +/// Make a header in control chat describing upcoming review keyboards. Should typically be +/// followed with messages containing said review keyboards. +/// +/// `reported` message is the one with the spam links in question. +/// +/// `reporting` message is the one that initiated this spam report; typically starts with "/spam" +/// or such. +/// +/// For the above two parameters, the same message can be passed twice. +/// +/// `sender_name` should be formatted from the `reporting` message, or set to [`None`] if this review is +/// initiated automatically by the spam checker. +pub async fn send_review_header( + bot: &Bot, + reported: &Message, + reporting: &Message, + sender_name: Option<&str>, +) -> Result<(), RequestError> { + let sender_name = sender_name.unwrap_or("automatic check"); + let chat_name = chat_name_prettyprint(&reporting.chat, true); + + let notify_text = + format!("New link(s) were added to review pool by {sender_name} in {chat_name}"); + + let same_chat = reported.chat.id == reporting.chat.id; + let same_message = same_chat && reported.id == reporting.id; + + if reporting.chat.id != CONTROL_CHAT_ID { + // Forward the relevant message(s) first, if they're not in control chat already. + // + // It's a good nicety, but it could fail: the messages may be protected from forwarding, or + // an admin might have deleted them in just the right moment, or telegram goes funny again. + // So, honestly, just ignore it failing. + if same_message { + // Just forward it. + let _ = bot + .forward_message(CONTROL_CHAT_ID, reporting.chat.id, reporting.id) + .await; + } else if same_chat { + // In the same chat. Forward them both with this call. 
+ let _ = bot + .forward_messages( + CONTROL_CHAT_ID, + reporting.chat.id, + [reported.id, reporting.id], + ) + .await; + } else { + // Two different messages in two different chats. Forward individually. + let _ = bot + .forward_message(CONTROL_CHAT_ID, reported.chat.id, reported.id) + .await; + let _ = bot + .forward_message(CONTROL_CHAT_ID, reporting.chat.id, reporting.id) + .await; + } + } + + if teloxide_retry!( + bot.send_message(CONTROL_CHAT_ID, &notify_text) + .parse_mode(teloxide::types::ParseMode::Html) + .await + ) + .is_err() + { + log::error!("Failed notifying control chat of new marked sus link!\n{notify_text}"); + } + + Ok(()) +} + +/// Assuming `review_entry_id`, `sanitized_url` and `original_url` correspond to a review queue entry, +/// edits the message specified by `chat_id` and `message_id` into a new review keyboard. +pub async fn edit_message_into_a_review_keyboard( + bot: &Bot, + chat_id: ChatId, + message_id: MessageId, + review_entry_id: i64, + sanitized_url: &SanitizedUrl, + original_url: &Url, + database: &Database, +) -> Result<(), RequestError> { + let best_match = database + .get_url_full(sanitized_url) + .await + .expect("Database died!"); + + let best_match_url = best_match.as_ref().map(UrlInfoFull::sanitized_url); + + let (text, buttons) = ReviewCallbackData::produce_review_keyboard_text_buttons( + review_entry_id, + sanitized_url, + original_url, + best_match_url, + ); + + let edit_result = bot + .edit_message_text(chat_id, message_id, text) + .parse_mode(teloxide::types::ParseMode::Html) + .reply_markup(buttons) + .disable_link_preview(true) + .await; + + // If we get this error, that means that the message was modified to the + // exact same thing as it was before. This means we're getting the same thing. 
+ if let Err(RequestError::Api(ApiError::MessageNotModified)) = edit_result { + bot.edit_message_text(chat_id, message_id, "There are no more URLs to review.") + .reply_markup(InlineKeyboardMarkup { + inline_keyboard: Vec::new(), + }) + .await?; + return Ok(()); + } + + edit_result?; + + database + .review_keyboard_made(chat_id, message_id, review_entry_id) + .await + .expect("Database died!"); + + Ok(()) +} + +/// Fetches one URL from the review queue and edits the message specified by `chat_id` and +/// `message_id` into a new review keyboard. +pub async fn edit_message_into_a_new_review_keyboard( + bot: &Bot, + chat_id: ChatId, + message_id: MessageId, + database: &Database, +) -> Result<(), RequestError> { + let Some((review_entry_id, sanitized_url, original_url)) = + database.get_url_for_review().await.expect("Database died!") + else { + bot.edit_message_text(chat_id, message_id, "There are no more URLs to review.") + .reply_markup(InlineKeyboardMarkup { + inline_keyboard: Vec::new(), + }) + .await?; + return Ok(()); + }; + + edit_message_into_a_review_keyboard( + bot, + chat_id, + message_id, + review_entry_id, + &sanitized_url, + &original_url, + database, + ) + .await +} + +/// Assuming `review_entry_id`, `sanitized_url` and `original_url` correspond to a review queue entry, +/// creates a review keyboard in specified `chat_id`, optionally replying to a message. 
+pub async fn send_review_keyboard( + bot: &Bot, + chat_id: ChatId, + reply_to_message_id: Option<MessageId>, + review_entry_id: i64, + sanitized_url: &SanitizedUrl, + original_url: &Url, + database: &Database, +) -> Result<Message, RequestError> { + let best_match = database + .get_url_full(sanitized_url) + .await + .expect("Database died!"); + + let best_match_url = best_match.as_ref().map(UrlInfoFull::sanitized_url); + + let (text, buttons) = ReviewCallbackData::produce_review_keyboard_text_buttons( + review_entry_id, + sanitized_url, + original_url, + best_match_url, + ); + + let mut request = bot + .send_message(chat_id, text) + .parse_mode(teloxide::types::ParseMode::Html) + .reply_markup(buttons) + .disable_link_preview(true); + + if let Some(reply_to) = reply_to_message_id { + request = request.reply_to(reply_to); + } + + let result = request.await; + + if let Ok(message) = result.as_ref() { + database + .review_keyboard_made(message.chat.id, message.id, review_entry_id) + .await + .expect("Database died!"); + } + + result +} + +/// Fetches one URL from the review queue and creates a review keyboard in specified `chat_id`, +/// optionally replying to a message. +pub async fn send_new_review_keyboard( + bot: &Bot, + chat_id: ChatId, + reply_to_message_id: Option<MessageId>, + database: &Database, +) -> Result<Message, RequestError> { + if let Some((review_entry_id, sanitized_url, original_url)) = + database.get_url_for_review().await.expect("Database died!") + { + send_review_keyboard( + bot, + chat_id, + reply_to_message_id, + review_entry_id, + &sanitized_url, + &original_url, + database, + ) + .await + } else { + let mut request = bot.send_message(chat_id, "There are no more URLs to review."); + if let Some(reply_to) = reply_to_message_id { + request = request.reply_to(reply_to); + } + + request.await + } +} + +/// Assuming the `chat_id` and `message_id` point at a review keyboard with given `sanitized_url`, +/// discard it as handled by a user of name `handled_by_name`. 
+pub async fn discard_review_keyboard( + bot: &Bot, + chat_id: ChatId, + message_id: MessageId, + handled_by_name: &str, + designation: UrlDesignation, + sanitized_url: &SanitizedUrl, +) -> Result<(), RequestError> { + let text = format!("Handled by {handled_by_name}:\n{designation}\n{sanitized_url}",); + + bot.edit_message_text(chat_id, message_id, text) + .parse_mode(teloxide::types::ParseMode::Html) + .reply_markup(InlineKeyboardMarkup { + inline_keyboard: Vec::new(), + }) + .disable_link_preview(true) + .await?; + + Ok(()) +} + +/// Launches an ever-running loop that reminds people in the control chat every 24 hours about +/// unreviewed URLs. +pub async fn remind_about_reviews_spinloop(bot: Bot, database: std::sync::Weak<Database>) { + use tokio::time::{sleep, Duration}; + loop { + let Some(database) = database.upgrade() else { + // No more database! + return; + }; + + let review_count = match database.get_review_count().await { + Ok(r) => r, + Err(e) => { + // Database died! + log::error!("Database error! {e:?}"); + return; + } + }; + + if review_count > 0 { + // No biggie if this fails, honestly. + let _ = teloxide_retry!( + bot.send_message( + CONTROL_CHAT_ID, + format!( + concat!( + "There are {} URLs awaiting review. ", + "DM this bot /review to review." + ), + review_count + ) + ) + .await + ); + } + + // Drop the upgraded database. 
+ drop(database); + // Sleep for a day lol + sleep(Duration::from_hours(24)).await; + } +} diff --git a/anti_nft_spam_bot/src/database/mod.rs b/anti_nft_spam_bot/src/database/mod.rs index 48ea504..b7228c4 100644 --- a/anti_nft_spam_bot/src/database/mod.rs +++ b/anti_nft_spam_bot/src/database/mod.rs @@ -1,48 +1,38 @@ use std::{ - collections::HashSet, str::FromStr, sync::{atomic::AtomicBool, Arc}, }; use chrono::Utc; -pub use sqlx::Error; +use futures_util::{stream::BoxStream, TryStreamExt}; use sqlx::{ error::ErrorKind, migrate::MigrateDatabase, - sqlite::{SqliteConnectOptions, SqlitePoolOptions, SqliteRow}, - Executor, Row, Sqlite, + sqlite::{Sqlite, SqliteConnectOptions, SqlitePoolOptions, SqliteRow}, + Executor, Row, Transaction, }; -use teloxide::types::{ChatId, Message, MessageId}; -use tokio::sync::{Mutex, Notify}; +use teloxide::types::{ChatId, MediaGroupId, MessageId}; use url::Url; -use crate::{ - parse_url_like_telegram, sender_name_prettyprint, - spam_checker::SPAM_CHECKER_VERSION, - types::{MarkSusResult, ReviewResponse}, -}; +use crate::{misc::parse_url_like_telegram, sanitized_url::SanitizedUrl, types::UrlDesignation}; + +pub use sqlx::Error; -use super::types::{Domain, IsSpam}; +mod types; +pub use types::*; type Pool = sqlx::Pool<Sqlite>; -const DB_PATH: &str = "sqlite:spam_domains.sqlite"; +const DB_PATH: &str = "sqlite:anti_nft_spam_bot.sqlite"; static WAS_CONSTRUCTED: AtomicBool = AtomicBool::new(false); +/// The database. pub struct Database { + /// The connection pool. pool: Pool, - // Mutexes are bad. However, this will only be used for reviews, - // which can only be done by a few people in the control chat. - review_lock: Mutex<()>, - /// A list of domains that are currently being visited by other tasks. 
- /// For the reason the Mutex is used, see code of [`Self::domain_visit_debounce`] - // I'd make this a std::sync::Mutex but Rust incorrectly assumes it lives after - // the drop and doesn't let it compile lol - domains_currently_being_visited: Mutex>, - /// A [`Notify`] used to wake up tasks waiting on other tasks to visit some domain. - domains_visit_notify: Notify, } impl Database { + /// Create a new database. pub fn new() -> impl std::future::Future, Error>> + Send { Self::new_by_path(DB_PATH, true) } @@ -71,1206 +61,1375 @@ impl Database { .max_connections(32) .connect_with( SqliteConnectOptions::from_str(path) - .unwrap() + .expect("SQLite connect options should be valid") .pragma("cache_size", "-32768") + .foreign_keys(true) // Already default, but doesn't hurt being explicit. .busy_timeout(std::time::Duration::from_secs(600)), ) .await?; - // Do some init. Create the tables... - - // DOMAINS: - // domain (unique primary key, string) - // example_url (string) - // is_spam (0 for no, 1 for yes, 2 for unknown and needs review) - // (2 should NOT usually happen here, but eh) - // last_sent_to_review (date+time in UTC timezone in ISO 8601 format) + // URLS: + // id (i64, unique primary key) + // host (text, a host of a `SanitizedUrl`) + // path (text, a path of a `SanitizedUrl`) + // query (text, a query of a `SanitizedUrl`, can be empty to mean no query) + // param_count (i64, amount of params in the URL query (stored in `url_params`); for example,"?a&b=50&c" is 3 params) + // original_url (text, full original URL with no lowercasing) + // designation (u8, representing a value in the enum `UrlDesignation`) // manually_reviewed (0 for no, 1 for yes) - // spam_checker_version (version of this program this was determined at) - pool.execute(sqlx::query( - " - CREATE TABLE IF NOT EXISTS domains ( - domain TEXT PRIMARY KEY NOT NULL COLLATE NOCASE, - example_url TEXT NULL, - is_spam INTEGER NOT NULL, - last_sent_to_review TEXT NULL, - manually_reviewed INTEGER NOT 
NULL DEFAULT 0, - spam_checker_version INTEGER NOT NULL DEFAULT 0 - ) STRICT;", - )) + pool.execute( + "CREATE TABLE IF NOT EXISTS urls ( + id INTEGER PRIMARY KEY NOT NULL, + host TEXT NOT NULL, + path TEXT NOT NULL CHECK (SUBSTR(path, 1, 1)='/'), + query TEXT NOT NULL, + param_count INTEGER NOT NULL, + original_url TEXT NOT NULL, + designation INTEGER NOT NULL, + manually_reviewed INTEGER NOT NULL, + UNIQUE (host, path, query) + ) STRICT;", + ) .await?; - // URLS: - // url (unique primary key, string) - // is_spam (0 for no, 1 for yes, 2 for unknown and needs review) - // last_sent_to_review (date+time in UTC timezone in ISO 8601 format) - // manually_reviewed (0 for no, 1 for yes) - // spam_checker_version (version of this program this was determined at) - pool.execute(sqlx::query( - " - CREATE TABLE IF NOT EXISTS urls ( - url TEXT PRIMARY KEY NOT NULL COLLATE NOCASE, - is_spam INTEGER NOT NULL, - last_sent_to_review TEXT NULL, - manually_reviewed INTEGER NOT NULL DEFAULT 0, - spam_checker_version INTEGER NOT NULL DEFAULT 0 - ) STRICT;", - )) + // URL_PARAMS: + // url_id (i64, references `id` of table `urls`) + // param (text, a percent-encoded URL param itself; for example, "v=dQw4w9WgXcQ") + pool.execute( + "CREATE TABLE IF NOT EXISTS url_params ( + url_id INTEGER NOT NULL, + param TEXT NOT NULL, + UNIQUE (url_id, param), + FOREIGN KEY (url_id) REFERENCES urls(id) + ) STRICT;", + ) .await?; // HIDE_DELETES: - // An admin of chats listed here asked to hide + // Admins of chats listed here asked to hide // bot's notifications about deleting a message. 
- // chatid (unique primary key, i64) + // chat_id (unique primary key, i64) pool.execute(sqlx::query( - " - CREATE TABLE IF NOT EXISTS hide_deletes ( - chatid INTEGER PRIMARY KEY NOT NULL - ) STRICT;", + "CREATE TABLE IF NOT EXISTS hide_deletes ( + chat_id INTEGER PRIMARY KEY NOT NULL + ) STRICT;", + )) + .await?; + + // LAST_DELETED_ALBUM_ID: + // chat_id (unique primary key, i64) + // media_group_id (text, album ID of the last deleted message) + pool.execute(sqlx::query( + "CREATE TABLE IF NOT EXISTS last_deleted_album_id ( + chat_id INTEGER PRIMARY KEY NOT NULL, + media_group_id TEXT NOT NULL + ) STRICT;", )) .await?; + // review_queue: + // id (i64, unique primary key) + // sanitized_url (text, full `SanitizedUrl`) + // original_url (text, full original URL with no lowercasing) + // last_sent_to_review (date+time in UTC timezone in ISO 8601 format) + pool.execute( + "CREATE TABLE IF NOT EXISTS review_queue ( + id INTEGER PRIMARY KEY NOT NULL, + sanitized_url TEXT NOT NULL UNIQUE, + original_url TEXT NOT NULL, + last_sent_to_review TEXT NULL, + UNIQUE (sanitized_url) + ) STRICT;", + ) + .await?; + // SUS_LINK_SIGHTINGS: - // List of messages where a link marked as sus was sighted. + // List of messages where a link on review was sighted. // Used to delete all of them if the link is marked as spam. 
- // chatid (i64) - // messageid (i32 (because telegram bot api is just like that)) - // sendername (text) - // urlid (i64, references rowid of table `urls`) + // chat_id (i64) + // message_id (i32 (because telegram bot api is just like that)) + // sender_name (text) + // url_id (i64, references rowid of table `review_queue`) pool.execute(sqlx::query( - " - CREATE TABLE IF NOT EXISTS sus_link_sightings ( - chatid INTEGER NOT NULL, - messageid INTEGER NOT NULL, - sendername TEXT NOT NULL, - urlid INTEGER NOT NULL, - UNIQUE (chatid, messageid) + "CREATE TABLE IF NOT EXISTS sus_link_sightings ( + chat_id INTEGER NOT NULL, + message_id INTEGER NOT NULL, + sender_name TEXT NOT NULL, + url_id INTEGER NOT NULL, + UNIQUE (chat_id, message_id, url_id), + FOREIGN KEY (url_id) REFERENCES review_queue(id) ) STRICT;", )) .await?; - // Transparent database migration lololol - // Will fail harmlessly if the column already exists. - let _ = sqlx::query( - "ALTER TABLE domains - ADD COLUMN manually_reviewed INTEGER NOT NULL DEFAULT 0;", - ) - .execute(&pool) - .await; - let _ = sqlx::query( - "ALTER TABLE domains - ADD COLUMN spam_checker_version INTEGER NOT NULL DEFAULT 0;", - ) - .execute(&pool) - .await; - let _ = sqlx::query( - "ALTER TABLE urls - ADD COLUMN spam_checker_version INTEGER NOT NULL DEFAULT 0;", - ) - .execute(&pool) - .await; + // REVIEW_KEYBOARDS: + // chat_id (i64) + // message_id (i32 (because telegram bot api is just like that)) + // url_id (i64, references rowid of table `review_queue`) + pool.execute(sqlx::query( + "CREATE TABLE IF NOT EXISTS review_keyboards ( + chat_id INTEGER NOT NULL, + message_id INTEGER NOT NULL, + url_id INTEGER NOT NULL, + UNIQUE (chat_id, message_id), + FOREIGN KEY (url_id) REFERENCES review_queue(id) + ) STRICT;", + )) + .await?; - let _ = sqlx::query("ALTER TABLE domains DROP COLUMN from_spam_list;") - .execute(&pool) + // Two automated indices suggested by sqlite3_expert + let _ = pool + .execute(sqlx::query( + "CREATE INDEX 
url_params_idx ON url_params(param);", + )) .await; - let _ = sqlx::query("ALTER TABLE urls DROP COLUMN from_spam_list;") - .execute(&pool) + let _ = pool + .execute(sqlx::query( + "CREATE INDEX urls_host_path_id_idx ON urls(host, path, id DESC);", + )) .await; - let db_arc = Arc::new(Database { - pool, - review_lock: Mutex::new(()), - domains_currently_being_visited: Mutex::new(HashSet::with_capacity(4)), - domains_visit_notify: Notify::new(), - }); - - Ok(db_arc) + Ok(Arc::new(Database { pool })) } - /// Check if a domain is a spam domain or not, according to the database. - /// Returns [`None`] if it's not in the database. - /// - /// Note that [`Self::is_url_spam`] should take priority over this, - /// unless its return result is [`IsSpam::Maybe`]. + /// If the input URL has no query, provide empty string for that argument. /// - /// "No" and "Maybe" results that were automatically determined by - /// an old spam checker are ignored unless `return_old_checker_results` is set to true. + /// # Panics /// - /// Also returns an example URL stored with the database, and a - /// boolean that is true if this result is manually reviewed. - pub async fn is_domain_spam( - &self, - domain: &Domain, - return_old_checker_results: bool, - ) -> Result, bool)>, Error> { - // The "NOT" condition is to exclude results that says anything other than `IsSpam::Yes` - // and are automatically determined by an older spam check version. - // We DON'T want to delete those, because they should still be useful for review. - sqlx::query( - "SELECT is_spam, example_url, manually_reviewed FROM domains - WHERE domain=? AND NOT (is_spam!=1 AND spam_checker_version, + host: &str, + path: &str, + query: &str, + ) -> Result, Error> { + assert!( + path.starts_with('/'), + "Provided path must correspond to one of a URL" + ); + + let Some(row) = sqlx::query( + "SELECT id, param_count, designation, manually_reviewed FROM urls + WHERE host=? AND path=? 
AND query=?;", ) - .bind(domain.as_str()) - .bind(if return_old_checker_results { - 0 - } else { - SPAM_CHECKER_VERSION - }) - .map(|row: SqliteRow| { - ( - IsSpam::from(row.get::("is_spam")), - row.get::, _>("example_url") - .map(|x| Url::parse(&x).unwrap()), - row.get::("manually_reviewed"), - ) - }) - .fetch_optional(&self.pool) - .await + .bind(host) + .bind(path) + .bind(query) + .fetch_optional(executor) + .await? + else { + return Ok(None); + }; + let id: i64 = row.get(0); + let param_count: i64 = row.get(1); + let designation = UrlDesignation::try_from(row.get::(2)) + .expect("Invalid URL designation found in database!"); + let manually_reviewed: bool = row.get(3); + + Ok(Some(UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + })) } - /// Check if a URL is a spam URL or not, according to the database. - /// Returns [`None`] if it's not in the database. - /// - /// Note that this should take priority over [`Self::is_domain_spam`], - /// unless this function's return result is [`IsSpam::Maybe`]. + /// # Panics /// - /// "No" and "Maybe" results that were automatically determined by - /// an old spam checker are ignored unless `return_old_checker_results` is set to true. - /// - /// Also returns a boolean that is true if this result is manually reviewed. - pub async fn is_url_spam( - &self, - url: &Url, - return_old_checker_results: bool, - ) -> Result, Error> { - // The "NOT" condition is to exclude results that says anything other than `IsSpam::Yes` - // and are automatically determined by an older spam check version. - // We DON'T want to delete those, because they should still be useful for review. - sqlx::query( - "SELECT is_spam, manually_reviewed FROM urls - WHERE url=? 
AND NOT (is_spam!=1 AND spam_checker_version( + &'a self, + url: &'a SanitizedUrl, + ) -> impl std::future::Future, Error>> + Send + 'a { + Self::get_url_exact_destructured( + &self.pool, + url.host_str(), + url.path(), + url.query().unwrap_or(""), ) - .bind(url.as_str()) - .bind(if return_old_checker_results { - 0 - } else { - SPAM_CHECKER_VERSION - }) - .map(|row: SqliteRow| { - ( - IsSpam::from(row.get::("is_spam")), - row.get::("manually_reviewed"), - ) - }) - .fetch_optional(&self.pool) - .await } - /// Check if a given URL (or its domain) is spam or not, according to the database. - /// Convenience method for [`Self::is_domain_spam`] and [`Self::is_url_spam`] - /// Returns [`None`] if it's not in the database. - /// - /// Argument `domain` is optional and, if `url` check is indecisive, - /// is used if provided, or extracted from URL if not. + /// # Panics /// - /// "No" and "Maybe" results that were automatically determined by - /// an old spam checker are ignored unless `return_old_checker_results` is set to true. - /// - /// Also returns a boolean that is true if this result is manually reviewed. - pub async fn is_spam( + /// Panics if an invalid [`UrlDesignation`] is found in the database. + async fn get_url_inexact_with_query( &self, - url: &Url, - domain: impl Into>, - return_old_checker_results: bool, - ) -> Result, Error> { - let mut domain = domain.into(); - // Look for URL match... - let url_result = self.is_url_spam(url, return_old_checker_results).await?; - - if let Some((IsSpam::Yes, _)) = url_result { - return Ok(url_result); + url: &SanitizedUrl, + ) -> Result, Error> { + // The query is needed a bit later in the code, but it's a good idea to let `.expect` panic + // here early if needed. + let query = url + .query() + .expect("This function expects URLs with query to be passed"); + + // Try to find an exact match first real quick? + if let Some(exact_match) = self.get_url_exact(url).await? 
{ + return Ok(Some(exact_match)); } - // If no provided domain, try to get one from the URL. - // Otherwise, use provided domain, to not do an extraneous allocation. - let domain_inner; - if domain.is_none() { - domain_inner = Domain::from_url(url); - domain = domain_inner.as_ref(); - } + let params = query.split('&'); + let param_count = params.clone().count(); + + // The idea behind this query is: + // 1. Inner join urls with params + // 2. Filter by host and path. + // 3. If `exact` function argument is true, filter by exact amount of parameter count too. + // 4. Filter params to only those that appear in input URL. + // 5. Group params by URL ID; COUNT(*) is now the amount of matched params per matched URL + // 6. Filter URLs for which the amount of matching params is not equal to amount of + // total params that URL has. This excludes URLs with more params than input. + // 7. Order filtered URLs by param count; the one matching the most wins. + // If there's a conflict... ¯\_ (ツ)_/¯ + + let sql_query_str = { + // SQLx doesn't do array inserts of any kinds yet, so this is the best we can do for + // now with SQLite, aside from maybe making a temp table and inserting each param into + // it. + + let sql_query_template = " + SELECT + urls.id, + urls.param_count, + urls.designation, + urls.manually_reviewed + FROM + url_params, + urls + WHERE + urls.id=url_params.url_id AND + urls.host=$1 AND + urls.path=$2 AND + param IN (!!!THE_PARAMS!!!) + GROUP BY urls.id HAVING COUNT(*) == urls.param_count + ORDER BY urls.param_count DESC + LIMIT 1;"; + + // We want to replace "!!!THE_PARAMS!!!" with something like "$3, $4, $5", with one + // number for each param. 
+ + let (pre_params, post_params) = sql_query_template + .split_once("!!!THE_PARAMS!!!") + .expect("The params must exist in the string"); + + let mut sql_query = String::with_capacity(sql_query_template.len()); + sql_query.push_str(pre_params); + + let mut pushed_a_param = false; + for i in 0..param_count { + use std::fmt::Write; + + if pushed_a_param { + sql_query.push(','); + } + write!(sql_query, "${}", i + 3).expect("Writing to a String never fails"); + pushed_a_param = true; + } - // Look for domain match... - let domain_result = if let Some(domain) = domain { - self.is_domain_spam(domain, return_old_checker_results) - .await? - } else { - None - }; + sql_query.push_str(post_params); - // Pick the most condemning one. - let most_condeming = IsSpam::pick_most_condemning( - url_result.map(|x| x.0), - domain_result.as_ref().map(|x| x.0), - ); + sql_query + }; - if let Some(most_condeming) = most_condeming { - let manually_reviewed = if most_condeming.1 { - domain_result.unwrap().2 - } else { - url_result.unwrap().1 - }; + let mut sql_query = sqlx::query(&sql_query_str) + .bind(url.as_ref().host_str()) + .bind(url.as_ref().path()); - Ok(Some((most_condeming.0, manually_reviewed))) - } else { - Ok(None) + for param in params { + sql_query = sql_query.bind(param); } - } - - /// Inserts a domain into the database and tags it as spam or not. - /// Overwrites the domain if it already exists. - pub async fn add_domain( - &self, - domain: &Domain, - example_url: impl Into>, - is_spam: IsSpam, - manually_reviewed: bool, - ) -> Result<(), Error> { - let example_url = example_url.into(); - sqlx::query( - "INSERT INTO domains( - domain, - example_url, - is_spam, - manually_reviewed, - spam_checker_version) - VALUES (?, ?, ?, ?, ?) 
- ON CONFLICT DO UPDATE SET - example_url=COALESCE(?, example_url), - is_spam=?, - manually_reviewed=?, - spam_checker_version=?;", - ) - .bind(domain.as_str()) - .bind(example_url.map(Url::as_str)) - .bind::(is_spam.into()) - .bind(manually_reviewed) - .bind(SPAM_CHECKER_VERSION) - // On conflict... - .bind(example_url.map(Url::as_str)) - .bind::(is_spam.into()) - .bind(manually_reviewed) - .bind(SPAM_CHECKER_VERSION) - .execute(&self.pool) - .await?; - if let Some(url) = example_url { - // If we know for a fact that this URL and its domain is - // spam, we don't need an entry in the `urls` table for it. - if is_spam == IsSpam::Yes { - sqlx::query("DELETE FROM urls WHERE url=? AND is_spam=?;") - .bind(url.as_str()) - .bind::(is_spam.into()) - .execute(&self.pool) - .await?; - } - // And delete all sightings. - // If this was marked as spam, the sightings would have already been processed. - // This here is mostly just to catch stragglers that could have gotten - // in after they were, but before the link was marked here. - self.delete_all_sightings_of(url).await?; - } + let Some(row) = sql_query.fetch_optional(&self.pool).await? else { + return Ok(None); + }; - Ok(()) + let id: i64 = row.get(0); + let param_count: i64 = row.get(1); + let designation = UrlDesignation::try_from(row.get::(2)) + .expect("Invalid URL designation found in database!"); + let manually_reviewed: bool = row.get(3); + + Ok(Some(UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + })) } - /// Inserts a URL into the database and tags it as spam or not. - /// Overwrites the URL if it already exists. - pub async fn add_url( + async fn get_url_inexact_assuming_no_query( &self, - url: &Url, - is_spam: IsSpam, - manually_reviewed: bool, - ) -> Result<(), Error> { - sqlx::query( - "INSERT INTO urls( - url, - is_spam, - manually_reviewed, - spam_checker_version) - VALUES (?, ?, ?, ?) 
- ON CONFLICT DO UPDATE SET - is_spam=?, - manually_reviewed=?, - spam_checker_version=?;", - ) - .bind(url.as_str()) - .bind::(is_spam.into()) - .bind(manually_reviewed) - .bind(SPAM_CHECKER_VERSION) - // On conflict... - .bind::(is_spam.into()) - .bind(manually_reviewed) - .bind(SPAM_CHECKER_VERSION) - .execute(&self.pool) - .await?; - - // And delete all sightings. - // If this was marked as spam, the sightings would have already been processed. - // This here is mostly just to catch stragglers that could have gotten - // in after they were, but before the link was marked here. - self.delete_all_sightings_of(url).await?; + url: &SanitizedUrl, + ) -> Result, Error> { + for (host, path) in url.destructure() { + if let Some(result) = + Self::get_url_exact_destructured(&self.pool, host, path, "").await? + { + return Ok(Some(result)); + } + } - Ok(()) + // Nothing matched. Oops. + Ok(None) } - /// Mark a URL as maybe spam, if it's not already marked as spam - /// and wasn't manually reviewed. Returns true if anything is actually done. - /// - /// Note that this adds a URL entry if one doesn't exist, - /// even if there's a meaningful domain entry. - async fn mark_url_sus(&self, url: &Url) -> Result { - let result = sqlx::query( - " - INSERT INTO urls( - url, - is_spam, - spam_checker_version - ) VALUES (?, 2, ?) - ON CONFLICT DO - UPDATE SET is_spam=2, spam_checker_version=? - WHERE is_spam=0 AND manually_reviewed=0;", - ) - .bind(url.as_str()) - .bind(SPAM_CHECKER_VERSION) - .bind(SPAM_CHECKER_VERSION) - .execute(&self.pool) - .await? - .rows_affected() - > 0; - Ok(result) - } - - /// Convenience function to mark both a URL and its domain as maybe spam. - pub async fn mark_sus( - &self, - url: &Url, - mut domain: Option<&Domain>, - ) -> Result { - // We only want to deal with entries in the database that exist. - - // Check the URL one. - if let Some(is_spam_url) = self.is_url_spam(url, false).await? 
{ - let result = match is_spam_url.0 { - IsSpam::Yes => MarkSusResult::AlreadyMarkedSpam, - IsSpam::Maybe => MarkSusResult::AlreadyMarkedSus, - IsSpam::No => { - let mark_result = !is_spam_url.1 && self.mark_url_sus(url).await?; - if mark_result { - MarkSusResult::Marked - } else { - MarkSusResult::ManuallyReviewedNotSpam - } - } - }; - - return Ok(result); - } - - // If no provided domain, try to get one from the URL. - // Otherwise, use provided domain, to not do an extraneous allocation. - let domain_inner; - if domain.is_none() { - domain_inner = Domain::from_url(url); - domain = domain_inner.as_ref(); - } - if let Some(domain) = domain { - // Check the domain one. - if let Some(is_spam_domain) = self.is_domain_spam(domain, false).await? { - let result = match is_spam_domain.0 { - IsSpam::Yes => MarkSusResult::AlreadyMarkedSpam, - IsSpam::Maybe => MarkSusResult::AlreadyMarkedSus, - IsSpam::No => { - let mark_result = self.mark_url_sus(url).await?; - if mark_result { - MarkSusResult::Marked - } else { - MarkSusResult::ManuallyReviewedNotSpam - } - } - }; - - return Ok(result); + /// Find and get short info for a URL designations entry matching the given URL, if any. + pub async fn get_url(&self, url: &SanitizedUrl) -> Result, Error> { + // Try matching on query. + if url.as_ref().query().is_some() { + if let Some(result) = self.get_url_inexact_with_query(url).await? { + return Ok(Some(result)); } } - // It is in neither URL nor Domain tables. - // Add it in as a URL entry. - self.mark_url_sus(url).await?; - Ok(MarkSusResult::Marked) + // Nothing found or no query. Either way, + self.get_url_inexact_assuming_no_query(url).await } - /// Count and return the amount of links left to review. 
- pub async fn get_review_count(&self) -> Result { - sqlx::query( - "SELECT SUM(A) FROM - ( - SELECT COUNT(*) AS A FROM urls WHERE is_spam=2 - UNION ALL - SELECT COUNT(*) AS A FROM domains WHERE is_spam=2 - );", - ) - .map(|x: SqliteRow| x.get(0)) - .fetch_one(&self.pool) - .await + /// Find and get short info from the URL designations table for an entry with this ID, if any. + /// + /// # Panics + /// + /// Panics if an invalid [`UrlDesignation`] is found in the database. + #[allow(unused)] // Used in tests, actually. + pub async fn get_url_by_id_short(&self, id: i64) -> Result, Error> { + let Some(row) = + sqlx::query("SELECT param_count, designation, manually_reviewed FROM urls WHERE id=?;") + .bind(id) + .fetch_optional(&self.pool) + .await? + else { + return Ok(None); + }; + let param_count: i64 = row.get(0); + let designation = UrlDesignation::try_from(row.get::(1)) + .expect("Invalid URL designation found in database!"); + let manually_reviewed: bool = row.get(2); + + Ok(Some(UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + })) } - /// Get a URL, and its database table and ID, for review, and its state in the database. - pub async fn get_url_for_review(&self) -> Result, Error> { - // Get the mutex. It'll be unlocked at the end of the function - // automatically due to RAII. - let _the_mutex = self.review_lock.lock(); - - // We heard you like database queries UwU - let db_result: Option<(Url, IsSpam, i64, bool)> = sqlx::query( - "SELECT * FROM - ( - SELECT url, is_spam, rowid, 1 AS from_urls_table, - manually_reviewed, last_sent_to_review - FROM urls - UNION - SELECT COALESCE(example_url, domain) AS url, is_spam, - rowid, 0 AS from_urls_table, - manually_reviewed, last_sent_to_review - FROM domains - ) - ORDER BY manually_reviewed, is_spam DESC, last_sent_to_review, rowid DESC LIMIT 1;", + /// Find and get full info from the URL designations table for an entry with this ID, if any. 
+ /// + /// # Panics + /// + /// Panics if an invalid [`UrlDesignation`], [`Url`], or [`SanitizedUrl`] is found in the database. + pub async fn get_url_by_id_full(&self, id: i64) -> Result, Error> { + let Some(row) = sqlx::query( + "SELECT + urls.param_count, + urls.original_url, + urls.designation, + urls.manually_reviewed + FROM + urls + WHERE urls.id=?;", ) - .map(|row: SqliteRow| { - ( - parse_url_like_telegram(row.get("url")).expect("Database has invalid URL data!"), - IsSpam::from(row.get::("is_spam")), - row.get::("rowid"), - row.get::("from_urls_table"), - ) - }) + .bind(id) .fetch_optional(&self.pool) - .await?; - - let Some((url, is_spam, rowid, from_urls_table)) = db_result else { - // Well dang. + .await? + else { return Ok(None); }; - // Write the time at which this entry was sent to review... - { - let db_query = if from_urls_table { - "UPDATE urls SET last_sent_to_review=? WHERE rowid=?;" - } else { - "UPDATE domains SET last_sent_to_review=? WHERE rowid=?;" - }; - - // Mark this URL or domain in the database as sent to review. - let time = Utc::now(); + // Extract to variables. + let param_count: i64 = row.get(0); + let original_url: &str = row.get(1); + let designation: u8 = row.get(2); + let manually_reviewed: bool = row.get(3); + + // Now combine to concrete types. + // Sanitized URL can be derived from original URL. + let (sanitized_url, original_url) = SanitizedUrl::from_str_with_original(original_url) + .expect("Invalid URL found in database!"); + let designation = UrlDesignation::try_from(designation) + .expect("Invalid URL designation found in database!"); + + Ok(Some(UrlInfoFull { + short: UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + }, + sanitized_url, + original_url, + })) + } - sqlx::query(db_query) - .bind(time) - .bind(rowid.to_string()) - .execute(&self.pool) - .await?; + /// Find and get full info for a URL designations entry matching the given URL, if any. 
+ pub async fn get_url_full(&self, url: &SanitizedUrl) -> Result, Error> { + match self.get_url(url).await? { + Some(short) => self.get_url_by_id_full(short.id).await, + None => Ok(None), } - - let table_name = match from_urls_table { - false => "domains", - true => "urls", - }; - - // Pass it on. - Ok(Some((url, table_name, rowid, is_spam))) } - /// Get a URL from a database table name and rowid. - pub async fn get_url_from_table_and_rowid( - &self, - table: &str, - rowid: i64, - ) -> Result)>, Error> { - match table { - "domains" => { - sqlx::query("SELECT domain, example_url FROM domains WHERE rowid=?") - .bind(rowid) - .map(|row: SqliteRow| { - let example_url: Option<&str> = row.get("example_url"); - let example_url = example_url.map(|x| { - parse_url_like_telegram(x).expect("Unparsable example URL in database!") - }); - - let domain: &str = row.get("domain"); - let domain_url = parse_url_like_telegram(domain) - .expect("Unparsable domain as URL in database!"); - let domain = - Domain::from_url(&domain_url).expect("Unparsable domain in database!"); - (example_url.unwrap_or(domain_url), Some(domain)) - }) - .fetch_optional(&self.pool) - .await + /// Returns ID of the inserted URL. + async fn insert_url_unchecked( + transaction: &mut Transaction<'_, Sqlite>, + sanitized_url: &SanitizedUrl, + original_url: &Url, + designation: UrlDesignation, + manually_reviewed: bool, + ) -> Result { + let params = sanitized_url + .as_ref() + .query() + .into_iter() + .flat_map(|x| x.split('&')); + let param_count = params.clone().count(); + + let new_id = sqlx::query( + "INSERT INTO urls ( + host, + path, + query, + param_count, + original_url, + designation, + manually_reviewed) + VALUES (?, ?, ?, ?, ?, ?, ?) 
+ ", + ) + .bind(sanitized_url.host_str()) + .bind(sanitized_url.as_ref().path()) + .bind(sanitized_url.query().unwrap_or("")) + .bind(param_count.cast_signed() as i64) + .bind(original_url.as_str()) + .bind(designation as u8) + .bind(manually_reviewed) + .execute(&mut **transaction) + .await? + .last_insert_rowid(); + + if param_count > 0 { + let mut query = String::from("INSERT INTO url_params(url_id, param) VALUES "); + for i in 0..param_count { + use std::fmt::Write; + write!(query, "({new_id}, ?)").expect("Writing to a String never fails"); + if i != param_count - 1 { + query.push(','); + } } - "urls" => { - sqlx::query("SELECT url FROM urls WHERE rowid=?") - .bind(rowid) - .map(|row: SqliteRow| { - let url: &str = row.get("url"); - let url = - parse_url_like_telegram(url).expect("Unparsable URL in database!"); - - (url, None) - }) - .fetch_optional(&self.pool) - .await + query.push(';'); + + let mut query = sqlx::query(&query); + + for param in params { + query = query.bind(param); } - _ => Ok(None), + + query.execute(&mut **transaction).await?; } - } - /// Remove a domain from the database, if it exists. - #[allow(dead_code)] - pub async fn remove_domain(&self, domain: &Domain) -> Result<(), Error> { - sqlx::query("DELETE FROM domains WHERE domain=?;") - .bind(domain.as_str()) - .execute(&self.pool) - .await?; - Ok(()) + Ok(new_id) } - /// Remove a URL from the database, if it exists. - pub async fn remove_url(&self, url: &Url) -> Result<(), Error> { - sqlx::query("DELETE FROM urls WHERE url=?;") - .bind(url.as_str()) - .execute(&self.pool) + async fn update_url_unchecked( + transaction: &mut Transaction<'_, Sqlite>, + id: i64, + designation: UrlDesignation, + manually_reviewed: bool, + ) -> Result<(), Error> { + sqlx::query("UPDATE urls SET designation=?, manually_reviewed=? 
WHERE id=?;") + .bind(designation as u8) + .bind(manually_reviewed) + .bind(id) + .execute(&mut **transaction) .await?; + Ok(()) } - pub async fn read_review_response(&self, response: &ReviewResponse) -> Result<(), Error> { - match response { - ReviewResponse::Skip => (), - ReviewResponse::UrlSpam(_domain, url) => { - self.add_url(url, IsSpam::Yes, true).await?; + /// Insert this into the database, or update an existing entry if one exists. + /// + /// You most likely want to call [`crate::actions::insert_or_update_url_with_log`] instead. + /// + /// If there is an entry already present in the database, then it's possible no change is + /// enacted. Specifically, that happens if the new info is not manually reviewed but existing + /// info is (i.e. an automatic review would be overwriting an old one; in this case, a warning + /// is emitted), or if both old and new designation and manual review status match. + pub async fn insert_or_update_url( + &self, + sanitized_url: &SanitizedUrl, + original_url: &Url, + designation: UrlDesignation, + manually_reviewed: bool, + ) -> Result { + // Just try inserting as is! If it fails, we'll get a UNIQUE violation error. + + let mut trans = self.pool.begin().await?; + + let insert_result = Self::insert_url_unchecked( + &mut trans, + sanitized_url, + original_url, + designation, + manually_reviewed, + ) + .await; + + match insert_result { + Ok(new_id) => { + // nice. + trans.commit().await?; + return Ok(InsertOrUpdateResult::Inserted { new_id }); } - ReviewResponse::DomainSpam(domain, url) => { - self.add_domain(domain, Some(url), IsSpam::Yes, true) - .await?; - // Implicitly this means that this specific URL is also spam, - // as part of this domain. - self.remove_url(url).await?; + Err(Error::Database(e)) if e.kind() == ErrorKind::UniqueViolation => { + // An entry exists. Continue to code below. } - ReviewResponse::NotSpam(domain, url) => { - // Neither domain nor URL are spam. 
- - // Write the result about the URL unconditionally. - self.add_url(url, IsSpam::No, true).await?; - - // If not provided, try to get it from the URL. - let mut domain = domain; - let domain_tmp; - if domain.is_none() { - domain_tmp = Domain::from_url(url); - domain = &domain_tmp; - } + Err(e) => { + // Some other error. Uh-oh! + // Dropping trans rolls the transaction back. + return Err(e); + } + } - if let Some(domain) = &domain { - // Write about the domain even if there's no domain-specific record. - self.add_domain(domain, Some(url), IsSpam::No, true).await?; - } + // If we're here, that means a unique violation has happened i.e. an entry exists. + // ...Find it??? + + let exact_match = Self::get_url_exact_destructured( + &mut *trans, + sanitized_url.host_str(), + sanitized_url.path(), + sanitized_url.query().unwrap_or(""), + ) + .await? + .expect("Quantum state URL detected!! What?!?!"); + + // Ok yes good. Update it? + + if !manually_reviewed && exact_match.manually_reviewed() { + // This will overwrite a manually reviewed entry with an automatically determined + // one. Bad! + log::warn!( + "Automatic review tried to overwrite data on manual review for {}", + sanitized_url.as_str() + ); + return Ok(InsertOrUpdateResult::NoChange { + existing_info: exact_match, + }); + } + + if designation == exact_match.designation() { + // Both old and new have the same designation. At this point, the only change + // that could be enacted is changing the "manually reviewed" flag. + if manually_reviewed == exact_match.manually_reviewed() { + // ...But even that matches. No change can be enacted. + return Ok(InsertOrUpdateResult::NoChange { + existing_info: exact_match, + }); } } - Ok(()) + // Above checks passed. This will enact a change. 
+ Self::update_url_unchecked(&mut trans, exact_match.id, designation, manually_reviewed) + .await?; + + Ok(InsertOrUpdateResult::Updated { + old_info: exact_match, + }) } /// Gets whether or not admins of this chat want the bot to not show /// notifications about deleting a message. - pub async fn get_hide_deletes(&self, chatid: ChatId) -> Result { - sqlx::query("SELECT 1 FROM hide_deletes WHERE chatid=?") - .bind(chatid.0) + /// + /// True if they don't want them to show, false otherwise. + pub async fn get_hide_deletes(&self, chat_id: ChatId) -> Result { + sqlx::query("SELECT 1 FROM hide_deletes WHERE chat_id=?") + .bind(chat_id.0) .fetch_optional(&self.pool) .await .map(|x| x.is_some()) } - /// Sets whether or not admins of this chat want the bot to not show - /// notifications about deleting a message. Returns the previous state. - pub async fn set_hide_deletes(&self, chatid: ChatId, hide: bool) -> Result { - let old_state = self.get_hide_deletes(chatid).await?; - - if old_state == hide { - // It's already set to that. Do nothing, return true. - return Ok(hide); - } - - // Aw, we actually have to do things now :( - + /// Sets whether or not admins of this chat want the bot to not show notifications about + /// deleting a message. Returns the previous state. + pub async fn set_hide_deletes(&self, chat_id: ChatId, hide: bool) -> Result { if hide { sqlx::query( - "INSERT INTO hide_deletes (chatid) + "INSERT INTO hide_deletes (chat_id) VALUES (?) ON CONFLICT DO NOTHING;", ) - .bind(chatid.0) + .bind(chat_id.0) .execute(&self.pool) - .await?; + .await + .map(|x| x.rows_affected() == 0) // If true, this means it was set already. } else { - sqlx::query("DELETE FROM hide_deletes WHERE chatid=?;") - .bind(chatid.0) + sqlx::query("DELETE FROM hide_deletes WHERE chat_id=?;") + .bind(chat_id.0) .execute(&self.pool) - .await?; + .await + .map(|x| x.rows_affected() > 0) // If true, this means it was set previously. 
} - - Ok(old_state) } - /// Tells the database that this sus link is sighted in this message and with this sender name. + /// Inform the database of the ID of the last album which's message was deleted within a chat. /// - /// If sender name is not provided, it is derived from the message. If a record for this URL - /// exists, it's not inserted; duplicate runs of this function are allowed. - pub async fn sus_link_sighted( + /// Used in conjunction with [`Self::get_last_deleted_album_id`]. + pub async fn set_last_deleted_album_id( &self, - message: &Message, - sendername: Option<&str>, - link: &Url, + chat_id: ChatId, + album_id: &MediaGroupId, ) -> Result<(), Error> { - let sendername_string; - - let sendername = if let Some(sendername) = sendername { - sendername - } else { - sendername_string = sender_name_prettyprint(message, false); - &sendername_string - }; + sqlx::query( + "INSERT INTO last_deleted_album_id (chat_id, media_group_id) + VALUES ($1, $2) + ON CONFLICT DO UPDATE SET media_group_id=$2;", + ) + .bind(chat_id.0) + .bind(&album_id.0) + .execute(&self.pool) + .await?; + Ok(()) + } - // First, if it's so sus, find it in the database. - let Some(rowid) = sqlx::query("SELECT rowid FROM urls WHERE is_spam=2 AND url=?") - .bind(link.as_str()) - .map(|row: SqliteRow| row.get::(0)) + /// Get the last deleted album ID in this chat. Used to delete other messages in the same + /// album. + /// + /// Used in conjunction with [`Self::set_last_deleted_album_id`]. + pub async fn get_last_deleted_album_id( + &self, + chat_id: ChatId, + ) -> Result, Error> { + sqlx::query("SELECT media_group_id FROM last_deleted_album_id WHERE chat_id=? LIMIT 1;") + .bind(chat_id.0) + .map(|row: SqliteRow| MediaGroupId(row.get(0))) .fetch_optional(&self.pool) - .await? - else { - // It's not in the database marked as sus? Huh. Whatever. 
- return Ok(()); - }; + .await + } - let err = sqlx::query( - "INSERT INTO sus_link_sightings (chatid, messageid, sendername, urlid) - VALUES (?, ?, ?, ?)", + /// Send this URL into the review queue. + pub async fn send_to_review( + &self, + sanitized_url: &SanitizedUrl, + original_url: &Url, + ) -> Result { + // A naive implementation of this method would be: + // + // 1. Check if there's URL already in the database due to which this should be rejected. + // 2. If not, insert into review queue. + // + // But if other code inserts a new URL inbetween these two steps, a + // time-of-check-time-of-use bug might occur. Not a big deal, but worth avoiding. + // + // So, fancy-pants plan: + // 1. Create a transaction. + // 2. Insert into review queue in the transaction; on conflict, reject. + // This write-locks the database, so new URLs can be inserted. + // 3. Check if there's a conflicting URL in the database right now. If so, rollback and + // reject. + // + // Shouldn't cause a TOCTOU in this case, I think. + + let mut trans = self.pool.begin().await?; + + let result = sqlx::query( + "INSERT INTO review_queue + (sanitized_url, original_url) + VALUES + (?, ?) + ON CONFLICT DO NOTHING;", + ) + .bind(sanitized_url.as_str()) + .bind(original_url.as_str()) + .execute(&mut *trans) + .await?; + + if result.rows_affected() == 0 { + // Dropping a transaction rolls it back. + return Ok(SendToReviewResult::AlreadyOnReview); + } + + // Check for conflicting existing entries. + // Note: the check is done on the main database connection. + // This is fine with SQLite because the transaction made above is blocking all writes. + if let Some(existing) = self.get_url_full(sanitized_url).await? { + if existing.designation() == UrlDesignation::Spam { + // If this marks it as spam, reject. 
+ return Ok(SendToReviewResult::AlreadyInDatabase(existing)); + } + + if existing.manually_reviewed() && existing.sanitized_url() == sanitized_url { + // If it's a perfect match and manually reviewed, reject. + return Ok(SendToReviewResult::AlreadyInDatabase(existing)); + } + } + + // None! Commit transaction. + trans.commit().await?; + + Ok(SendToReviewResult::Sent { + review_entry_id: result.last_insert_rowid(), + }) + } + + /// Returns database ID of the URL sent on review, and the URL itself. + /// + /// # Panics + /// + /// Panics if an invalid [`Url`] or [`SanitizedUrl`] is found in the database. + pub async fn get_url_for_review(&self) -> Result, Error> { + Ok(sqlx::query( + "UPDATE review_queue + SET last_sent_to_review=? + WHERE id in + (SELECT id FROM review_queue ORDER BY last_sent_to_review LIMIT 1) + RETURNING id, sanitized_url, original_url;", + ) + .bind(Utc::now()) + .fetch_optional(&self.pool) + .await? + .map(|row| { + ( + row.get(0), + SanitizedUrl::from_str(row.get(1)).expect("Invalid URL found in database!"), + Url::parse(row.get(2)).expect("Invalid URL found in database!"), + ) + })) + } + + /// Inform the database that this URL was sighted in this message in this chat and with this + /// name of the sender. + /// + /// This is done to later get them with [`Self::pop_review_link_sightings`] and remove them + /// if the review concludes that the link is spam. + pub async fn link_sighted( + &self, + chat_id: ChatId, + message_id: MessageId, + sender_name: &str, + link: &SanitizedUrl, + ) -> Result<(), Error> { + // Check if this is a sus link; if so, write down this sighting. 
+ + let result = sqlx::query( + "INSERT INTO sus_link_sightings + (chat_id, message_id, sender_name, url_id) + VALUES + (?, ?, ?, (SELECT id FROM review_queue WHERE sanitized_url=?)) + ON CONFLICT DO NOTHING;", ) - .bind(message.chat.id.0) - .bind(message.id.0) - .bind(sendername) - .bind(rowid) + .bind(chat_id.0) + .bind(message_id.0) + .bind(sender_name) + .bind(link.as_str()) .execute(&self.pool) .await; - match err { - Err(Error::Database(dbe)) if dbe.kind() == ErrorKind::UniqueViolation => { - // That means we already have this. Good! + match result { + Err(Error::Database(e)) if e.kind() == ErrorKind::NotNullViolation => { + // This means the SELECT statement has returned nothing/NULL, + // which means this URL is not on review. That's fine, just ignore. } - _ => { - err?; + x => { + x?; } } Ok(()) } - /// Deletes from database and returns all sightings of this URL, - /// plus all sightings of other spam URLs, if any. - pub async fn drain_all_sightings_of_spam( + /// Delete a link from review with this ID. + /// + /// If there are existing sightings or keyboards for this URL, an error of + /// [`ErrorKind::ForeignKeyViolation`] is returned. + pub async fn delete_from_review(&self, id: i64) -> Result<(), Error> { + sqlx::query("DELETE FROM review_queue WHERE id=?") + .bind(id) + .execute(&self.pool) + .await?; + + Ok(()) + } + + /// Remove and return all sightings of a URL in the review queue at this ID. + /// + /// A sighting is a chat ID, a message ID, and name of the sender. + /// + /// If no matching URL is found, [`Error::RowNotFound`] is returned. + pub fn pop_review_link_sightings( &self, - link: &Url, - ) -> Result, Error> { + review_entry_id: i64, + ) -> BoxStream<'_, Result<(ChatId, MessageId, String), Error>> { sqlx::query( - "DELETE FROM sus_link_sightings - WHERE urlid IN ( - SELECT rowid FROM urls WHERE url=? OR is_spam=1 + "DELETE FROM sus_link_sightings WHERE url_id=? 
RETURNING chat_id, message_id, sender_name;", ) - RETURNING chatid, messageid, sendername;", + .bind(review_entry_id) + .map(|row: SqliteRow| { + ( + ChatId(row.get(0)), + MessageId(row.get(1)), + row.get::(2), + ) + }) + .fetch(&self.pool) + } + + /// Find up to one entry still in the review queue for which the best match in the URL + /// designations table is the an entry with this ID. + pub async fn find_one_matching_review_queue_entry( + &self, + url_entry_to_match_id: i64, + ) -> Result, Error> { + let mut stream = sqlx::query( + "SELECT + id, + sanitized_url + FROM review_queue;", ) - .bind(link.as_str()) - .map(|row: SqliteRow| (ChatId(row.get(0)), MessageId(row.get(1)), row.get(2))) - .fetch_all(&self.pool) - .await + .map(|row: SqliteRow| { + ( + row.get::(0), + SanitizedUrl::from_str(row.get(1)).expect("Invalid SanitizedUrl in database!"), + ) + }) + .fetch(&self.pool); + + while let Some((review_entry_id, sanitized_url)) = stream.try_next().await? { + let Some(info) = self.get_url(&sanitized_url).await? else { + // No existing entry. Probably still in review. + continue; + }; + + if info.id() == url_entry_to_match_id { + // That's it! + return Ok(Some(review_entry_id)); + } + } + + // None. + Ok(None) } - pub async fn delete_all_sightings_of(&self, link: &Url) -> Result<(), Error> { + /// Inform the database that a review keyboard was made for a review queue entry at this ID, + /// and that it's a message at this message ID and chat ID. + pub async fn review_keyboard_made( + &self, + chat_id: ChatId, + message_id: MessageId, + review_entry_id: i64, + ) -> Result<(), Error> { sqlx::query( - "DELETE FROM sus_link_sightings - WHERE urlid IN ( - SELECT rowid FROM urls WHERE url=? 
- );", + "INSERT INTO review_keyboards + (chat_id, message_id, url_id) + VALUES ($1, $2, $3) + ON CONFLICT DO UPDATE SET url_id=$3;", ) - .bind(link.as_str()) + .bind(chat_id.0) + .bind(message_id.0) + .bind(review_entry_id) .execute(&self.pool) .await?; + Ok(()) } - /// Check if this domain is protected against accidentally marking as spam. + /// Inform the database that there is no longer a review keyboard at this message ID and chat + /// ID. + pub async fn review_keyboard_removed( + &self, + chat_id: ChatId, + message_id: MessageId, + ) -> Result<(), Error> { + sqlx::query("DELETE FROM review_keyboards WHERE chat_id=? AND message_id=?;") + .bind(chat_id.0) + .bind(message_id.0) + .execute(&self.pool) + .await?; + + Ok(()) + } + + /// Remove review keyboards for this review queue entry and return them one by one. + pub fn pop_review_keyboards( + &self, + review_entry_id: i64, + ) -> BoxStream<'_, Result<(ChatId, MessageId), Error>> { + sqlx::query("DELETE FROM review_keyboards WHERE url_id=? RETURNING chat_id, message_id") + .bind(review_entry_id) + .map(|row: SqliteRow| (ChatId(row.get(0)), MessageId(row.get(1)))) + .fetch(&self.pool) + } + + /// Get total count of review queue entries. + pub fn get_review_count(&self) -> impl std::future::Future> + '_ { + sqlx::query("SELECT COUNT(*) FROM review_queue;") + .map(|row: SqliteRow| row.get(0)) + .fetch_one(&self.pool) + } + + /// Imports data from the old, pre-rewrite version of this bot. /// - /// Currently a stub with hardcoded checks. 
- pub async fn is_domain_protected(&self, domain: &Domain) -> Result { - match domain.as_ref() { - "youtube.com" | "youtu.be" | "t.me" => Ok(true), - _ => Ok(false), + /// For database schema of the old bot, see: + /// + pub async fn import_from_old_database(self: &Arc) -> Result<(), Error> { + enum IsSpamOld { + No = 0, + Yes = 1, + Maybe = 2, + } + impl From for IsSpamOld { + fn from(value: u8) -> Self { + use IsSpamOld::*; + match value { + value if value == No as u8 => No, + value if value == Yes as u8 => Yes, + value if value == Maybe as u8 => Maybe, + _ => panic!("Unknown value: {value}"), + } + } } - } -} -pub struct DomainVisitDebounceGuard { - database: Arc, - domain: Domain, -} + async fn receiver_task( + database: Arc, + receiver: flume::Receiver<(SanitizedUrl, Url, UrlDesignation)>, + ) { + while let Ok((sanitized_url, url, designation)) = receiver.recv() { + database + .insert_or_update_url(&sanitized_url, &url, designation, false) + .await + .expect("Failed to insert into database!"); + } + } -impl Drop for DomainVisitDebounceGuard { - fn drop(&mut self) { - let tokio_handle = tokio::runtime::Handle::current(); - let database = self.database.clone(); - let mut domain = Domain::new_invalid_unchecked(); + let (sender, receiver) = flume::bounded(64); - std::mem::swap(&mut self.domain, &mut domain); + let receiver_task = tokio::spawn(receiver_task(self.clone(), receiver)); - tokio_handle.spawn(async move { - database - .domains_currently_being_visited - .lock() + let oldpool = SqlitePoolOptions::new() + .max_connections(32) + .connect_with( + SqliteConnectOptions::from_str("sqlite:spam_domains.sqlite") + .expect("SQLite connect options should be valid") + .pragma("cache_size", "-32768") + .foreign_keys(true) // Already default, but doesn't hurt being explicit. 
+ .busy_timeout(std::time::Duration::from_secs(600)), + ) + .await?; + + let oldpool_for_hide_deletes = oldpool.clone(); + let database_for_hide_deletes = self.clone(); + + let hide_deletes_task = tokio::spawn(async move { + let mut hide_deletes_stream = sqlx::query("SELECT chatid FROM hide_deletes") + .map(|row: SqliteRow| ChatId(row.get(0))) + .fetch(&oldpool_for_hide_deletes); + + while let Some(chatid) = hide_deletes_stream + .try_next() .await - .remove(&domain); - database.domains_visit_notify.notify_waiters(); + .expect("Old database died!") + { + database_for_hide_deletes + .set_hide_deletes(chatid, true) + .await + .expect("Database died!"); + } }); - } -} -impl Database { - /// Returns [`DomainVisitDebounceGuard`] if this domain isn't being visited, - /// or, if it is, blocks until that is done and then returns [`None`]. - pub async fn domain_visit_debounce( - self: &Arc, - domain: Domain, - ) -> Option { - // This is set to true if this domain was spotted to be in the process of being visited. - let mut was_visited = false; - - // Check if it's already being visited on loop until it's no longer being visited. - loop { - // Code below looks a bit weird, but it's to avoid a race condition. - // Consider the following scenario: - // 1. Task A is currently visiting a domain. - // 2. Task B runs this function to check it. - // 3. Task B finds the domain in the hash set in the check below. - // 4. Task A finishes visiting the domain and removes it from the - // hash set. - // 5. Task A then punts the Notify. - // 6. Task B, awaits the Notify *after* that. - // 7. Task B locks up until something else happens to punt the Notify. - // - // For this reason, task B has to ensure that task A can't punt the Notify - // after task B checked the hash set but before it awaits the Notify. - // - // This is accomplished with the visited_lock: the hash set is locked - // for checking, then Notify is awaited on, and only then it's unlocked. 
- - let mut visited_lock = self.domains_currently_being_visited.lock().await; - - let contains = visited_lock.contains(&domain); - if contains { - // It's being visited. Wait on notify and check again. - was_visited = true; - // Notified immediately starts listening as it is created. - let notify_waiter = self.domains_visit_notify.notified(); - drop(visited_lock); - notify_waiter.await; - } else { - // It is not or no longer being visited. - // Add it to the list and return the guard. - - if was_visited { - break None; - } + let old_domains_stream = sqlx::query("SELECT domain, example_url, is_spam FROM domains;") + .map(|row: SqliteRow| { + let domain = + SanitizedUrl::from_str(row.get(0)).expect("Invalid domain in database!"); + let example_url = + parse_url_like_telegram(row.get(1)).expect("Invalid example URL in database!"); + let is_spam = IsSpamOld::from(row.get::(2)); + + (domain, example_url, is_spam) + }) + .fetch(&oldpool); + + let old_urls_stream = sqlx::query("SELECT url, is_spam FROM urls") + .map(|row: SqliteRow| { + let (sanitized_url, url) = SanitizedUrl::from_str_with_original(row.get(0)) + .expect("Invalid example URL in database!"); + let is_spam = IsSpamOld::from(row.get::(1)); + + (sanitized_url, url, is_spam) + }) + .fetch(&oldpool); + + let mut old_urls_chain = + futures_util::StreamExt::chain(old_domains_stream, old_urls_stream); + + let mut counter = 0usize; + + while let Some((sanitized_url, url, is_spam)) = old_urls_chain.try_next().await? 
{ + let designation = match is_spam { + IsSpamOld::Yes => UrlDesignation::Spam, + IsSpamOld::No => UrlDesignation::NotSpam, + IsSpamOld::Maybe => continue, + }; - visited_lock.insert(domain.clone()); - drop(visited_lock); + sender + .send((sanitized_url, url, designation)) + .expect("Send channel died!"); - break Some(DomainVisitDebounceGuard { - database: self.clone(), - domain, - }); + counter += 1; + + if counter.is_multiple_of(10000) { + log::info!("Migrated {counter} URLs from old database..."); } } + + log::info!("Done sending URLs for insertion..."); + drop(sender); + + receiver_task.await.expect("Receiver task failed!"); + log::info!("Waiting for hide deletes sending to be done..."); + hide_deletes_task.await.expect("Hide deletes task failed!"); + + log::info!("Old database imported."); + + Ok(()) } } #[cfg(test)] mod tests { + #![allow(clippy::unwrap_used)] use super::*; + use UrlDesignation::*; - type Ret = Result<(), Error>; + // Convenience testing methods for Database + impl Database { + /// Returns IDs of an inexact and an exact match. 
+ async fn get_result(&self, url: &str) -> (Option, Option) { + let url: SanitizedUrl = url.parse().unwrap(); + let inexact = self.get_url(&url).await.unwrap().map(|x| x.id()); + let exact = self.get_url_exact(&url).await.unwrap().map(|x| x.id()); - #[tokio::test] - async fn create_db() -> Ret { - Database::new_test().await?; - Ok(()) - } + (inexact, exact) + } - #[tokio::test] - async fn is_url_spam() -> Ret { - let db = Database::new_test().await?; - let spam: Url = parse_url_like_telegram("amogus.com/badspam").unwrap(); + async fn insert(&self, url: &str) -> i64 { + let (sanitized, original) = SanitizedUrl::from_str_with_original(url).unwrap(); + self.insert_or_update_url(&sanitized, &original, NotSpam, true) + .await + .unwrap() + .id() + } + } - assert_eq!(db.is_url_spam(&spam, false).await?, None); - assert_eq!(db.is_spam(&spam, None, false).await?, None); + async fn new_db() -> Arc { + let db = Database::new_test().await.unwrap(); - db.add_url(&spam, IsSpam::Yes, false).await?; - assert_eq!( - db.is_url_spam(&spam, false).await?, - Some((IsSpam::Yes, false)) - ); + // Tests hardcode URL IDs here. + assert_eq!(1, db.insert("ftp://amogus.com/?b&a&a&c").await); + assert_eq!(2, db.insert("http://amogus.com/?a&d").await); + assert_eq!(3, db.insert("http://amogus.com/?e&a&b&c&d&e").await); + assert_eq!(4, db.insert("ftp://amogus.com/").await); + assert_eq!(5, db.insert("ftp://amogus.com/testpath/woot/").await); - let other: Url = parse_url_like_telegram("amogus.com/otherurl").unwrap(); - assert_eq!(db.is_url_spam(&other, false).await?, None); - - let spamdomain: Domain = Domain::from_url(&spam).unwrap(); - // This checks if the domain specifically is a spam, so it will return None. 
- assert_eq!(db.is_domain_spam(&spamdomain, false).await?, None); - Ok(()) + db } #[tokio::test] - async fn is_domain_spam() -> Ret { - let db = Database::new_test().await?; - let spamurl: Url = parse_url_like_telegram("amogus.com/badspam").unwrap(); - let spamdomain: Domain = Domain::from_url(&spamurl).unwrap(); - - assert_eq!(db.is_url_spam(&spamurl, false).await?, None); - assert_eq!(db.is_domain_spam(&spamdomain, false).await?, None); - assert_eq!(db.is_spam(&spamurl, None, false).await?, None); - assert_eq!(db.is_spam(&spamurl, Some(&spamdomain), false).await?, None); - - db.add_domain(&spamdomain, Some(&spamurl), IsSpam::Yes, false) - .await?; - // This checks if the URL specifically is a spam, so it will return None. - assert_eq!(db.is_url_spam(&spamurl, false).await?, None); - assert_eq!( - db.is_domain_spam(&spamdomain, false).await?, - Some((IsSpam::Yes, Some(spamurl.clone()), false)) - ); - assert_eq!( - db.is_spam(&spamurl, None, false).await?, - Some((IsSpam::Yes, false)) - ); - assert_eq!( - db.is_spam(&spamurl, Some(&spamdomain), false).await?, - Some((IsSpam::Yes, false)) - ); - - let other: Url = parse_url_like_telegram("amogus.com/otherurl").unwrap(); - let otherdomain: Domain = Domain::from_url(&other).unwrap(); - assert_eq!(spamdomain, otherdomain); - // This checks if the URL specifically is a spam, so it will return None. - assert_eq!(db.is_url_spam(&other, false).await?, None); - assert_eq!( - db.is_spam(&other, None, false).await?, - Some((IsSpam::Yes, false)) - ); - assert_eq!( - db.is_spam(&other, Some(&otherdomain), false).await?, - Some((IsSpam::Yes, false)) - ); - Ok(()) + async fn create_db() { + new_db().await; } + /// Should match exactly to 1 #[tokio::test] - async fn mark_sus_workflow() -> Ret { - let db = Database::new_test().await?; - let link = parse_url_like_telegram("example.com/notspam").unwrap(); - let domain = Domain::from_url(&link).unwrap(); - - // Let's say the link is posted in a chat. 
- // First, check `crate::spam_checker::check` is run, which defers to the db. - assert_eq!(db.is_spam(&link, None, false).await?, None); - - // Then, the checker determines it as not spam and adds it to - // the database. - db.add_domain(&domain, &link, IsSpam::No, false) - .await - .expect("Database died!"); - - // Then, someone marks it as sus. - assert_eq!(db.mark_sus(&link, None).await?, MarkSusResult::Marked); - - // Check if this is what it is in the database. - assert_eq!( - db.is_spam(&link, None, false).await?, - Some((IsSpam::Maybe, false)) - ); - - // Someone gets it in review... - let (review_url, review_table, review_id, db_state) = - db.get_url_for_review().await?.unwrap(); - assert_eq!(review_url, link); - assert_eq!(db_state, IsSpam::Maybe); - - // They mark it as not spam... - let from_db = db - .get_url_from_table_and_rowid(review_table, review_id) - .await? - .unwrap(); - assert_eq!(from_db.0, link); - let response = ReviewResponse::NotSpam(from_db.1, from_db.0); - db.read_review_response(&response).await?; - - // Someone later posts the link again. - // Check if this is what it is in the database. - assert_eq!( - db.is_spam(&link, None, false).await?, - Some((IsSpam::No, true)) - ); - - // Someone marks it as sus again... - assert_eq!( - db.mark_sus(&link, None).await?, - MarkSusResult::ManuallyReviewedNotSpam - ); - //db.mark_url_sus(&link).await?; + async fn match_params_exact() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/?a&c&b").await; + assert_eq!(inexact, Some(1)); + assert_eq!(exact, Some(1)); + } - //// It should still be not spam. 
- //assert_eq!(db.is_spam(&link, None, false).await?, Some(IsSpam::No)); + /// Has extraneous params but should match to 2 + #[tokio::test] + async fn match_params_with_extraneous() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/?a&d&c").await; + assert_eq!(inexact, Some(2)); + assert_eq!(exact, None); + } - Ok(()) + /// Has params that make it match both 1 and 2; the match with more params should win + #[tokio::test] + async fn match_params_with_multiple_matches() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/?b&a&c&d").await; + assert_eq!(inexact, Some(1)); + assert_eq!(exact, None); } + /// Has params but does not match anything with them; should match the URL without params. #[tokio::test] - async fn adding_links_actually_adds() -> Ret { - let url = parse_url_like_telegram("example.com/notspam").unwrap(); - let domain = Domain::from_url(&url).unwrap(); - - for spam_status in [IsSpam::No, IsSpam::Maybe, IsSpam::Yes] { - let db = Database::new_test().await?; - db.add_domain(&domain, &url, spam_status, false).await?; - assert_eq!( - db.is_spam(&url, &domain, true).await?, - Some((spam_status, false)) - ); - let db = Database::new_test().await?; - db.add_url(&url, spam_status, false).await?; - assert_eq!( - db.is_spam(&url, &domain, true).await?, - Some((spam_status, false)) - ); - } + async fn match_params_none() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/?b&a").await; + assert_eq!(inexact, Some(4)); + assert_eq!(exact, None); + } - Ok(()) + /// Has no params, but should be exact matches. + #[tokio::test] + async fn match_no_params_exact() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/").await; + assert_eq!(inexact, Some(4)); + assert_eq!(exact, Some(4)); } + /// Has no params, but should be exact matches. 
#[tokio::test] - async fn review_response_conflicts_with_db() -> Ret { - // Check all possible cases of `ReviewResponse` in - // relation to the database's state. - let url = parse_url_like_telegram("example.com/notspam").unwrap(); - let domain = Domain::from_url(&url).unwrap(); - - // Test cases. - let skip = ReviewResponse::Skip; - let notspam = ReviewResponse::NotSpam(Some(domain.clone()), url.clone()); - let urlspam = ReviewResponse::UrlSpam(Some(domain.clone()), url.clone()); - let domainspam = ReviewResponse::DomainSpam(domain.clone(), url.clone()); - - // Neither URL nor domain is in the database. - let db = Database::new_test().await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(notspam.conflicts_with_db(&db).await?); - assert!(urlspam.conflicts_with_db(&db).await?); - assert!(domainspam.conflicts_with_db(&db).await?); + async fn match_no_params_exact_with_longer_path() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/testpath/woot/").await; + assert_eq!(inexact, Some(5)); + assert_eq!(exact, Some(5)); + } - // + /// Has no exact match, but should eventually descend and match 4. + #[tokio::test] + async fn match_no_params_inexact() { + let db = new_db().await; + let (inexact, exact) = db.get_result("https://amogus.com/testpath/").await; + assert_eq!(inexact, Some(4)); + assert_eq!(exact, None); + } - // The URL is marked as not spam. - let db = Database::new_test().await?; - db.add_url(&url, IsSpam::No, true).await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(!notspam.conflicts_with_db(&db).await?); - assert!(urlspam.conflicts_with_db(&db).await?); - assert!(domainspam.conflicts_with_db(&db).await?); - - // The URL is marked as maybe spam. 
- let db = Database::new_test().await?; - db.add_url(&url, IsSpam::Maybe, true).await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(notspam.conflicts_with_db(&db).await?); - assert!(urlspam.conflicts_with_db(&db).await?); - assert!(domainspam.conflicts_with_db(&db).await?); - - // The URL is marked as yes spam. - let db = Database::new_test().await?; - db.add_url(&url, IsSpam::Yes, true).await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(notspam.conflicts_with_db(&db).await?); - assert!(!urlspam.conflicts_with_db(&db).await?); - assert!(domainspam.conflicts_with_db(&db).await?); + /// Has no exact match, but should eventually descend and match 4. + #[tokio::test] + async fn match_no_params_inexact_longer() { + let db = new_db().await; + let (inexact, exact) = db + .get_result("https://amogus.com/testpath/woot/aawagggga") + .await; + assert_eq!(inexact, Some(5)); + assert_eq!(exact, None); + } - // + /// Has no exact match, but should eventually descend and match 4. + #[tokio::test] + async fn match_no_params_inexact_with_input_params() { + let db = new_db().await; + let (inexact, exact) = db + .get_result("https://amogus.com/aawagga/amogus/?woot=3") + .await; + assert_eq!(inexact, Some(4)); + assert_eq!(exact, None); + } - // The domain is marked as not spam. - let db = Database::new_test().await?; - db.add_domain(&domain, &url, IsSpam::No, true).await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(!notspam.conflicts_with_db(&db).await?); - assert!(urlspam.conflicts_with_db(&db).await?); - assert!(domainspam.conflicts_with_db(&db).await?); - - // The domain is marked as maybe spam. - let db = Database::new_test().await?; - db.add_domain(&domain, &url, IsSpam::Maybe, true).await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(notspam.conflicts_with_db(&db).await?); - assert!(urlspam.conflicts_with_db(&db).await?); - assert!(domainspam.conflicts_with_db(&db).await?); - - // The domain is marked as yes spam. 
- let db = Database::new_test().await?; - db.add_domain(&domain, &url, IsSpam::Yes, true).await?; - assert!(!skip.conflicts_with_db(&db).await?); - assert!(notspam.conflicts_with_db(&db).await?); - assert!(urlspam.conflicts_with_db(&db).await?); - assert!(!domainspam.conflicts_with_db(&db).await?); + #[tokio::test] + async fn get_url_short_with_params() { + let db = new_db().await; + let UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + } = db.get_url_by_id_short(3).await.unwrap().unwrap(); + assert_eq!(id, 3); + assert_eq!(param_count, 5); + assert_eq!(designation, NotSpam); + assert!(manually_reviewed); + } - Ok(()) + #[tokio::test] + async fn get_url_short_with_path() { + let db = new_db().await; + let UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + } = db.get_url_by_id_short(5).await.unwrap().unwrap(); + assert_eq!(id, 5); + assert_eq!(param_count, 0); + assert_eq!(designation, NotSpam); + assert!(manually_reviewed); } #[tokio::test] - async fn marking_telegram_as_spam_by_accident() -> Ret { - // Scenario: - // 1. Someone puts a normal non-spam telegram link in review. - // 2. Admin marks it as not spam. - // 3. Someone puts a spam telegram link in review. - // 4. Admin accidentally responds that telegram's entire domain is spam. - // - // The make_a_db() function below creates a database in this state. 
+ async fn get_url_full_with_params() { + let db = new_db().await; + let UrlInfoFull { + short: + UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + }, + sanitized_url, + original_url, + } = db.get_url_by_id_full(3).await.unwrap().unwrap(); + + assert_eq!(id, 3); + assert_eq!(param_count, 5); + assert_eq!(designation, NotSpam); + assert!(manually_reviewed); + assert_eq!(sanitized_url.as_str(), "https://amogus.com/?a&b&c&d&e"); + assert_eq!(original_url.as_str(), "http://amogus.com/?e&a&b&c&d&e"); + } - let spam: Url = parse_url_like_telegram("t.me/badspam").unwrap(); - let normal: Url = parse_url_like_telegram("t.me/channels").unwrap(); - let tg = parse_url_like_telegram("t.me").unwrap(); - let tgdomain = Domain::from_url(&tg).unwrap(); + #[tokio::test] + async fn get_url_full_with_path() { + let db = new_db().await; + let UrlInfoFull { + short: + UrlInfoShort { + id, + param_count, + designation, + manually_reviewed, + }, + sanitized_url, + original_url, + } = db.get_url_by_id_full(5).await.unwrap().unwrap(); + + assert_eq!(id, 5); + assert_eq!(param_count, 0); + assert_eq!(designation, NotSpam); + assert!(manually_reviewed); + assert_eq!(sanitized_url.as_str(), "https://amogus.com/testpath/woot"); + assert_eq!(original_url.as_str(), "ftp://amogus.com/testpath/woot/"); + } - /// Make a database with initial state. - async fn make_a_db() -> Result, Error> { - let db = Database::new_test().await?; - let spam: Url = parse_url_like_telegram("t.me/badspam").unwrap(); - let normal: Url = parse_url_like_telegram("t.me/channels").unwrap(); - let tg = parse_url_like_telegram("t.me").unwrap(); - let tgdomain = Domain::from_url(&tg).unwrap(); + #[tokio::test] + async fn review_push_results() { + let db = new_db().await; - // Let's say someone marks Telegram itself as spam on accident. 
+ let urls = SanitizedUrl::from_str_with_original("amogus.com").unwrap(); + let result = db.send_to_review(&urls.0, &urls.1).await.unwrap(); + assert!(matches!(result, SendToReviewResult::AlreadyInDatabase(_))); - // Someone gets the normal link in review... - assert_eq!(db.mark_sus(&normal, None).await?, MarkSusResult::Marked); + let urls = SanitizedUrl::from_str_with_original("somenewurl.com").unwrap(); - // Someone gets it in review... - let (_, review_table, review_id, _) = db.get_url_for_review().await?.unwrap(); + let result = db.send_to_review(&urls.0, &urls.1).await.unwrap(); + assert!(matches!( + result, + SendToReviewResult::Sent { review_entry_id: 1 } + )); - // They mark it as not spam... - let from_db = db - .get_url_from_table_and_rowid(review_table, review_id) - .await? - .unwrap(); - assert_eq!(from_db.0, normal); - let response = ReviewResponse::NotSpam(from_db.1, from_db.0); - db.read_review_response(&response).await?; + let result = db.send_to_review(&urls.0, &urls.1).await.unwrap(); + assert!(matches!(result, SendToReviewResult::AlreadyOnReview)); + } - // Someone gets the spam link in review... - assert_eq!(db.mark_sus(&spam, None).await?, MarkSusResult::Marked); + #[tokio::test] + async fn review_push_get() { + let db = new_db().await; - // Someone gets it in review... - let (_, review_table, review_id, _) = db.get_url_for_review().await?.unwrap(); + let urls = SanitizedUrl::from_str_with_original("somenewurl.com").unwrap(); + let send_result = db.send_to_review(&urls.0, &urls.1).await.unwrap(); + assert!(matches!( + send_result, + SendToReviewResult::Sent { review_entry_id: 1 } + )); - // They mark the DOMAIN as spam on accident... - let from_db = db - .get_url_from_table_and_rowid(review_table, review_id) - .await? - .unwrap(); - assert_eq!(from_db.0, spam); - let response = ReviewResponse::DomainSpam(tgdomain.clone(), from_db.0); - db.read_review_response(&response).await?; - - // Oh no. The normal link is spam too. 
- assert_eq!( - db.is_spam(&normal, None, false).await.unwrap(), - Some((IsSpam::Yes, true)) - ); + let get_result = db.get_url_for_review().await.unwrap().map(|x| (x.1, x.2)); + assert_eq!(get_result, Some(urls.clone())); - // How will our heroes get out of this one? Find out on next episode of... - Ok(db) - } + let get_result = db.get_url_for_review().await.unwrap().map(|x| (x.1, x.2)); + assert_eq!(get_result, Some(urls)); + } - // Scenario continuation: - // 5. Admin tries to fix this by marking "t.me" as not spam. - let db = make_a_db().await?; - let response = ReviewResponse::NotSpam(Some(tgdomain.clone()), tg); - db.read_review_response(&response).await?; + /// When requesting a URL for review, it should return the oldest sent, so all URLs are cycled + /// through. + #[tokio::test] + async fn review_push_get_multiple() { + let db = new_db().await; + + let urls_a = SanitizedUrl::from_str_with_original("a.com").unwrap(); + let send_result = db.send_to_review(&urls_a.0, &urls_a.1).await.unwrap(); + assert!(matches!( + send_result, + SendToReviewResult::Sent { review_entry_id: 1 } + )); + + let urls_b = SanitizedUrl::from_str_with_original("b.com").unwrap(); + let send_result = db.send_to_review(&urls_b.0, &urls_b.1).await.unwrap(); + assert!(matches!( + send_result, + SendToReviewResult::Sent { review_entry_id: 2 } + )); + + let get_a = db.get_url_for_review().await.unwrap().map(|x| (x.1, x.2)); + assert_eq!(get_a, Some(urls_a.clone())); + + let get_b = db.get_url_for_review().await.unwrap().map(|x| (x.1, x.2)); + assert_eq!(get_b, Some(urls_b.clone())); + + let get_a = db.get_url_for_review().await.unwrap().map(|x| (x.1, x.2)); + assert_eq!(get_a, Some(urls_a.clone())); + + let get_b = db.get_url_for_review().await.unwrap().map(|x| (x.1, x.2)); + assert_eq!(get_b, Some(urls_b.clone())); + } - // Normal link shouldn't be spam now. 
- assert_eq!( - db.is_spam(&normal, None, false).await.unwrap(), - Some((IsSpam::No, true)) - ); - // In this case the spam link isn't either though. - assert_eq!( - db.is_spam(&spam, None, false).await.unwrap(), - Some((IsSpam::No, true)) - ); + /// Basically test that it doesn't crash lol + #[tokio::test] + async fn sus_link_sighting() { + let db = new_db().await; + let urls = SanitizedUrl::from_str_with_original("example.com").unwrap(); - // Scenario continuation (skill issue case): - // 5. Admin tries to fix this by marking the spam link as URL spam. - let db = make_a_db().await?; - let response = ReviewResponse::UrlSpam(Some(tgdomain), spam.clone()); - db.read_review_response(&response).await?; + // This URL is not in the database. Should do nothing. + db.link_sighted(ChatId(0), MessageId(0), "hi", &urls.0) + .await + .unwrap(); - // Normal link should still be spam: marking a - // particular link as spam shouldn't unmark the domain. - assert_eq!( - db.is_spam(&normal, None, false).await.unwrap(), - Some((IsSpam::Yes, true)) - ); - // In this case the spam link should still be considered spam. - assert_eq!( - db.is_spam(&spam, None, false).await.unwrap(), - Some((IsSpam::Yes, true)) - ); + let send_result = db.send_to_review(&urls.0, &urls.1).await.unwrap(); + assert!(matches!( + send_result, + SendToReviewResult::Sent { review_entry_id: 1 } + )); - Ok(()) + // This URL is now in the database. Should do something now. + db.link_sighted(ChatId(0), MessageId(0), "hi", &urls.0) + .await + .unwrap(); } + // Simple boolean logic, but yeag. #[tokio::test] - async fn mark_url_sus_if_domain_marked() -> Ret { - // Scenario: - // 1. A domain example.org is marked as spam. - // 2. Someone tries to mark example.org/12345 as sus. - // - // They should get the "is already marked as spam" response. 
- - let db = Database::new_test().await?; + async fn hide_deletes() { + let db = new_db().await; + + let chat_id = ChatId(1312); + + // false by default + assert!(!db.get_hide_deletes(chat_id).await.unwrap()); + assert!(!db.get_hide_deletes(chat_id).await.unwrap()); + // setting to false does nothing + assert!(!db.set_hide_deletes(chat_id, false).await.unwrap()); + assert!(!db.set_hide_deletes(chat_id, false).await.unwrap()); + // setting to true, well, sets to true + assert!(!db.set_hide_deletes(chat_id, true).await.unwrap()); + assert!(db.set_hide_deletes(chat_id, true).await.unwrap()); + assert!(db.get_hide_deletes(chat_id).await.unwrap()); + assert!(db.get_hide_deletes(chat_id).await.unwrap()); + // setting to false sets to false lol + assert!(db.set_hide_deletes(chat_id, false).await.unwrap()); + assert!(!db.set_hide_deletes(chat_id, false).await.unwrap()); + } - let spam_url = parse_url_like_telegram("example.org/12345").unwrap(); - let spam_domain = Domain::from_url(&spam_url).unwrap(); + #[tokio::test] + async fn last_deleted_album_id() { + let db = new_db().await; + let chat_id = ChatId(1312); - db.add_domain(&spam_domain, None, IsSpam::Yes, true).await?; - let sus_result = db.mark_sus(&spam_url, None).await?; - assert_eq!(sus_result, MarkSusResult::AlreadyMarkedSpam); + let album_id = MediaGroupId(String::from("amogus")); - // However, if it's marked as *not* spam, - // then trying to mark sus *should* succeed. 
+ assert_eq!(db.get_last_deleted_album_id(chat_id).await.unwrap(), None); - db.add_domain(&spam_domain, None, IsSpam::No, true).await?; - let sus_result = db.mark_sus(&spam_url, None).await?; - assert_eq!(sus_result, MarkSusResult::Marked); + db.set_last_deleted_album_id(chat_id, &album_id) + .await + .unwrap(); - Ok(()) + assert_eq!( + db.get_last_deleted_album_id(chat_id).await.unwrap(), + Some(album_id) + ); } } diff --git a/anti_nft_spam_bot/src/database/types.rs b/anti_nft_spam_bot/src/database/types.rs new file mode 100644 index 0000000..ac087c1 --- /dev/null +++ b/anti_nft_spam_bot/src/database/types.rs @@ -0,0 +1,150 @@ +use std::fmt::Display; + +use url::Url; + +use crate::{sanitized_url::SanitizedUrl, types::UrlDesignation}; + +/// Shortened info of a database entry on a URL. +#[derive(Clone, Copy, Debug, Hash)] +pub struct UrlInfoShort { + /// ID of the entry. + pub(super) id: i64, + /// Amount of `?query` parameters of the URL this entry is for. + pub(super) param_count: i64, + /// Designation of this URL. + pub(super) designation: UrlDesignation, + /// Whether or not this designation has been manually decided on by a reviewer. + pub(super) manually_reviewed: bool, +} + +impl UrlInfoShort { + /// Returns the ID of the entry. + #[must_use] + pub fn id(&self) -> i64 { + self.id + } + /// Returns the amount of `?query` parameters of the URL this entry is for. + #[must_use] + #[allow(unused)] + pub fn param_count(&self) -> i64 { + self.param_count + } + /// Returns the designation of this URL. + #[must_use] + pub fn designation(&self) -> UrlDesignation { + self.designation + } + /// Returns whether or not this entry's designation has been manually decided on by a reviewer. 
+ #[must_use] + pub fn manually_reviewed(&self) -> bool { + self.manually_reviewed + } +} + +impl Display for UrlInfoShort { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "ID: {}", self.id)?; + writeln!(f, "Parameter count: {}", self.param_count)?; + writeln!(f, "Designation: {}", self.designation)?; + writeln!(f, "Manually reviewed: {}", self.manually_reviewed) + } +} + +/// Full info of a database entry on a URL. +#[derive(Clone, Debug)] +pub struct UrlInfoFull { + /// Shortened info this is a superset of. + pub(super) short: UrlInfoShort, + /// Sanitized URL this entry represents. + pub(super) sanitized_url: SanitizedUrl, + /// Original URL this entry was based on. + pub(super) original_url: Url, +} + +impl UrlInfoFull { + /// Returns the ID of the entry. + #[must_use] + #[allow(unused)] + pub fn id(&self) -> i64 { + self.short.id + } + /// Returns the amount of `?query` parameters of the URL this entry is for. + #[must_use] + #[allow(unused)] + pub fn param_count(&self) -> i64 { + self.short.param_count + } + /// Returns the designation of this URL. + #[must_use] + pub fn designation(&self) -> UrlDesignation { + self.short.designation + } + /// Returns whether or not this entry's designation has been manually decided on by a reviewer. + #[must_use] + pub fn manually_reviewed(&self) -> bool { + self.short.manually_reviewed + } + /// Returns the short info of this entry. + #[allow(unused)] + #[must_use] + pub fn short(&self) -> &UrlInfoShort { + &self.short + } + /// Returns the sanitized URL this entry represents. + #[must_use] + pub fn sanitized_url(&self) -> &SanitizedUrl { + &self.sanitized_url + } + /// Returns the original URL this entry was based on. 
+ #[must_use] + #[allow(unused)] + pub fn original_url(&self) -> &Url { + &self.original_url + } +} + +impl Display for UrlInfoFull { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Sanitized URL: {}", self.sanitized_url)?; + writeln!(f, "Original URL: {}", self.original_url)?; + self.short.fmt(f) + } +} + +/// Result of [`Database::send_to_review`]. +/// +/// [`Database::send_to_review`]: super::Database::send_to_review +#[derive(Clone, Debug)] +pub enum SendToReviewResult { + /// The URL was successfully sent for review and is now under the given ID. + Sent { review_entry_id: i64 }, + /// This URL is already in the review queue. + AlreadyOnReview, + /// This URL is already in the database. + AlreadyInDatabase(UrlInfoFull), +} + +/// Result of [`Database::insert_or_update_url`]. +/// +/// [`Database::insert_or_update_url`]: super::Database::insert_or_update_url +#[derive(Clone, Copy, Hash, Debug)] +pub enum InsertOrUpdateResult { + /// New URL entry was inserted into the database. + Inserted { new_id: i64 }, + /// An existing URL entry was updated in the database with new info. + Updated { old_info: UrlInfoShort }, + /// No change was done. + NoChange { existing_info: UrlInfoShort }, +} + +impl InsertOrUpdateResult { + /// Returns ID of either the existing data or the newly inserted data. 
+ #[must_use] + pub fn id(&self) -> i64 { + match self { + Self::Inserted { new_id } => *new_id, + Self::Updated { old_info } => old_info.id(), + Self::NoChange { existing_info } => existing_info.id(), + } + } +} diff --git a/anti_nft_spam_bot/src/entry.rs b/anti_nft_spam_bot/src/entry.rs index efc5a05..08584ae 100644 --- a/anti_nft_spam_bot/src/entry.rs +++ b/anti_nft_spam_bot/src/entry.rs @@ -2,8 +2,9 @@ use std::{fs, sync::Arc}; use teloxide::{dptree::deps, prelude::*}; use crate::{ + actions::remind_about_reviews_spinloop, database::Database, - handlers::{generate_bot_commands, reviews::parse_callback_query}, + handlers::{generate_bot_commands, handle_callback_query}, }; /// # Panics @@ -19,28 +20,37 @@ pub async fn entry() { let bot = Bot::new(key); - let commands = generate_bot_commands(); - bot.set_my_commands(commands) + bot.set_my_commands(generate_bot_commands()) .await .expect("Failed to set bot commands!"); - let db: Arc = Database::new().await.unwrap(); + let database: Arc = Database::new().await.expect("Failed to create database!"); + + if let Err(e) = database.import_from_old_database().await { + log::warn!("Failed to import from old database: {e}"); + }; + + tokio::spawn(remind_about_reviews_spinloop( + bot.clone(), + Arc::downgrade(&database), + )); log::info!("Creating the handler..."); let handler = dptree::entry() - .branch(Update::filter_message().branch(dptree::endpoint(crate::handlers::handle_message))) - .branch( - Update::filter_edited_message() - .branch(dptree::endpoint(crate::handlers::handle_message)), - ) - .branch(Update::filter_callback_query().endpoint(parse_callback_query)); + .branch(Update::filter_message().branch(dptree::endpoint( + crate::handlers::handle_message_new_or_edit, + ))) + .branch(Update::filter_edited_message().branch(dptree::endpoint( + crate::handlers::handle_message_new_or_edit, + ))) + .branch(Update::filter_callback_query().endpoint(handle_callback_query)); log::info!("Dispatching the dispatcher!"); 
Dispatcher::builder(bot, handler) .default_handler(|_| async {}) - .dependencies(deps![db]) + .dependencies(deps![database]) .enable_ctrlc_handler() .build() .dispatch() diff --git a/anti_nft_spam_bot/src/handlers.rs b/anti_nft_spam_bot/src/handlers.rs new file mode 100644 index 0000000..5e6def9 --- /dev/null +++ b/anti_nft_spam_bot/src/handlers.rs @@ -0,0 +1,840 @@ +// yo dawg we heard you like imports + +use std::sync::Arc; + +use arch_bot_commons::useful_methods::{BotArchSendMsg, MessageStuff}; +use teloxide::{ + prelude::*, + sugar::request::{RequestLinkPreviewExt, RequestReplyExt}, + types::{BotCommand, MaybeInaccessibleMessage, Me}, + RequestError, +}; + +use crate::{ + actions::{ + authenticate_control, authenticate_control_of_sender, delete_message_as_spam, + discard_review_keyboard, edit_message_into_a_new_review_keyboard, + insert_or_update_url_with_log, send_new_review_keyboard, send_review_header, + send_review_keyboard, + }, + database::{Database, InsertOrUpdateResult, SendToReviewResult}, + misc::{ + does_message_have_spam_links, get_entity_url, is_sender_admin, is_sender_admin_with_cache, + iterate_over_all_links, sender_name_prettyprint, user_name_prettyprint, + }, + types::{MessageDeleteReason, ReviewCallbackData, UrlDesignation}, + CONTROL_CHAT_ID, +}; + +/// Handler for events of new or edited messages. +pub async fn handle_message_new_or_edit( + bot: Bot, + me: Me, + message: Message, + database: Arc, +) -> Result<(), RequestError> { + if let Some(sender) = &message.from { + if sender.id == me.id { + // Ignore messages sent by ourselves. + // Probably won't appear ever, but eh. + return Ok(()); + } + } + + let sender_name = sender_name_prettyprint(&message, false); + + // Whether or not the sender of this message is an admin, if checked. + // This variable is used to have to check this at most once across this function. 
+ let mut sent_by_admin_cache: Option = None; + + // This block below handles checking for spam and deleting the message, either now, or later due + // to a review of one of the URLs contained within. + if !message.chat.is_private() { + let mut deleted_this_message = false; + + // This message might be in an album that we want to delete. + if let Some(album_id) = message.media_group_id() { + let last_deleted = database + .get_last_deleted_album_id(message.chat.id) + .await + .expect("Database died!"); + + if last_deleted.as_ref() == Some(album_id) { + // Matches album ID of last deleted spam message. Delete this too. + delete_message_as_spam( + &bot, + &database, + &message, + Some(&sender_name), + MessageDeleteReason::OfAlbumWithSpamMessage, + ) + .await?; + + deleted_this_message = true; + } + } + + if !deleted_this_message && does_message_have_spam_links(&message, &database).await { + if is_sender_admin_with_cache(&bot, &message, &mut sent_by_admin_cache).await? { + if message.chat.id != CONTROL_CHAT_ID { + bot.archsendmsg_no_link_preview( + message.chat.id, + "Skipping deleting a message from an admin containing a spam link.", + None, + ) + .await?; + } + } else { + delete_message_as_spam( + &bot, + &database, + &message, + Some(&sender_name), + MessageDeleteReason::ContainsSpamLink, + ) + .await?; + deleted_this_message = true; + } + } + + if let Some(reply_to) = message.reply_to_message() { + // If this is a reply to another message within the same chat, check that message for + // spam links too. Probably a second time over. A link in that message might have been + // marked as spam since then. + if message.chat.id == reply_to.chat.id + && does_message_have_spam_links(reply_to, &database).await + && !is_sender_admin(&bot, reply_to).await? + { + delete_message_as_spam( + &bot, + &database, + reply_to, + None, + MessageDeleteReason::ContainsSpamLink, + ) + .await?; + } + } + + if deleted_this_message { + // This message was deleted as spam. 
No need for any further handling. + return Ok(()); + } + + // Above passed. This message was not deleted as spam. + // + // However, some of the links might be on review, and later might be marked as spam. + // In such case, we want to retroactively delete all messages where that link was seen that + // are eligible for deletion. + // + // Therefore, tell the database about the links. + for (sanitized_url, _original_url) in iterate_over_all_links(&message) { + // Only if this is *not* sent by a chat admin. We don't want to delete admin messages + // containing spam links. + if !is_sender_admin_with_cache(&bot, &message, &mut sent_by_admin_cache).await? { + if let Err(e) = database + .link_sighted(message.chat.id, message.id, &sender_name, &sanitized_url) + .await + { + log::error!("Database failed to sight link {sanitized_url}!\n{e:?}"); + } + } + } + } + + // Above passed. This message was not deleted as spam and links were sighted. + let is_edited = message.edit_date().is_some(); + + if !is_edited { + // Handle commands, potentially. + handle_command( + &bot, + &me, + &message, + &database, + &sender_name, + sent_by_admin_cache, + ) + .await?; + } + + Ok(()) +} + +/// Check if this message is a command, and if so, handle it. 
+/// +/// # Panics +/// +/// Panics if the database dies lol +pub async fn handle_command( + bot: &Bot, + me: &Me, + message: &Message, + database: &Database, + sender_name: &str, + mut sent_by_admin_cache: Option, +) -> Result<(), RequestError> { + let Some(mut text) = message.text_full() else { + // shrug + return Ok(()); + }; + + let is_private = message.chat.is_private(); + + // Special case: nag for cash money if someone says "good bot" uwu + static GOOD_BOT: &str = "good bot"; + if let Some((maybe, _)) = text.split_at_checked(GOOD_BOT.len()) { + if maybe.eq_ignore_ascii_case(GOOD_BOT) + && (is_private + || message + .reply_to_message() + .and_then(|x| x.from.as_ref()) + .is_some_and(|x| x.id == me.id)) + { + static NAG: &str = + "(Consider supporting? 👉👈)"; + bot.send_message(message.chat.id, NAG) + .reply_to(message.id) + .parse_mode(teloxide::types::ParseMode::Html) + .disable_link_preview(true) + .await?; + return Ok(()); + } + } + + // Commands start with a forward slash. + if !text.starts_with('/') { + // Not a command. Is this in DMs to the bot? If so, think of this as "/start" lol + if is_private { + text = "/start"; + } else { + return Ok(()); + } + } + + // Get first word in message, the command itself. + let Some(mut command) = text.split_whitespace().next() else { + return Ok(()); + }; + + let command_full_len = command.len(); + + // Strip bot username from the end such that a command like "/spam@Anti_NFT_Spam_Bot" would + // become just "/spam" + if let Some(command_no_username) = command.strip_suffix(me.username()) { + if let Some(command_no_username_and_at) = command_no_username.strip_suffix('@') { + command = command_no_username_and_at; + } + } + + let _params = &text[command_full_len..].trim_start(); + + // Lowercase, if needed. 
+ let tmp; + // The leading '/' is never lowercase, so test for uppercase letters rather than "all lowercase". + if command.chars().any(char::is_uppercase) { + tmp = command.to_lowercase(); + command = tmp.as_str(); + } + + match command { + // "/help" is what the unknown-command reply suggests, so treat it the same as "/start". + "/start" | "/help" if is_private => { + bot.archsendmsg_no_link_preview( + message.chat.id, concat!( +"This bot is made to combat various types of spam experienced by chats across Telegram.\n\n", +"To use this bot, add it to a chat and give it administrator status with \"Remove messages\" permission.\n\n", +"No further setup is required. A message will be sent when spam is removed.\n\n", +"For available commands, type / into the message text box below and see the previews.\n\n" +), + message.id, + ) + .await?; + + let is_reviewer = authenticate_control_of_sender(bot, message).await?; + + if is_reviewer { + // Also note super special super secret commands. + bot.archsendmsg_no_link_preview(message.chat.id, concat!( +"Super special reviewer commands:\n\n", + +"/mark_spam <URL>, /mark_url_spam <URL> - Insert or update an entry for a URL as spam.\n\n", + +"/mark_domain_spam <URL> - Remove path and query parameters from the URL then insert/update an ", +"entry for the resulting host-only URL as spam.\n\n", + +"/mark_not_spam <URL> - Insert or update an entry for a URL as not spam.\n\n", + +"/mark_aggregator <URL> - Insert or update an entry for a URL as a link aggregator. 
", +"A link aggregator is considered to be not spam itself, but URLs below it will be automatically checked.\n\n", + +"/review - Initiate a review keyboard.\n\n", + +"/info <URL> - Find a database entry that matches this URL and print its contents.\n\n", +), message.id) + .await?; + } + + Ok(()) + } + "/hide_deletes" | "/show_deletes" => { + handle_command_show_hide_deletes( + bot, + message, + database, + &mut sent_by_admin_cache, + command, + ) + .await + } + "/spam" | "/scam" => handle_command_spam(bot, message, database, sent_by_admin_cache).await, + "/mark_spam" | "/mark_url_spam" | "/mark_domain_spam" | "/mark_not_spam" + | "/mark_aggregator" => { + handle_command_mark(bot, message, database, sender_name, command).await + } + "/review" => handle_command_review(bot, message, database).await, + "/info" => handle_command_info(bot, message, database).await, + // NOTE: When adding new commands, also add them to `generate_bot_commands` function below. + _ if is_private => { + bot.archsendmsg_no_link_preview( + message.chat.id, + "Unknown command. Try /help for a list of commands.", + message.id, + ) + .await?; + Ok(()) + } + _ => { + // Woop. + Ok(()) + } + } +} + +/// Generate a list of bot commands to be shown as available by the bot. +#[must_use] +pub fn generate_bot_commands() -> Vec { + vec![ + BotCommand::new("/spam", "Mark links in a message for review as spam."), + BotCommand::new("/hide_deletes", "Hide spam deletion notification messages."), + BotCommand::new( + "/show_deletes", + "Don't hide spam deletion notification messages.", + ), + ] +} + +/// Handle this message assuming it's the command `/review`. +async fn handle_command_review( + bot: &Bot, + message: &Message, + database: &Database, +) -> Result<(), RequestError> { + if !authenticate_control_of_sender(bot, message).await? 
{ + return Ok(()); + } + + send_new_review_keyboard(bot, message.chat.id, Some(message.id), database).await?; + + Ok(()) +} + +/// Handle this message assuming it's the command `/show_deletes` or `/hide_deletes`. +async fn handle_command_show_hide_deletes( + bot: &Bot, + message: &Message, + database: &Database, + sent_by_admin_cache: &mut Option, + command: &str, +) -> Result<(), RequestError> { + if message.chat.is_private() + || !is_sender_admin_with_cache(bot, message, sent_by_admin_cache).await? + { + bot.archsendmsg_no_link_preview( + message.chat.id, + "This command can only be used by admins in group chats.", + message.id, + ) + .await?; + return Ok(()); + } + + let new_state = command == "/hide_deletes"; + + let old_state = database + .set_hide_deletes(message.chat.id, new_state) + .await + .expect("Database died!"); + + let response = match (old_state, new_state) { + (false, false) => "This chat doesn't hide spam deletion notifications already.", + (false, true) => concat!( + "I will no longer notify about messages being deleted. ", + "Note that this may lead to confusion in case I delete a ", + "message with a legitimate link due to a false positive. " + ), + (true, false) => "From now on I will notify about spam messages being deleted.", + (true, true) => "This chat has spam delete notifications hidden already.", + }; + + bot.archsendmsg_no_link_preview(message.chat.id, response, message.id) + .await?; + Ok(()) +} + +/// Handle this message assuming it's the command `/info`. +/// +/// Does nothing outside of private chats and the control chat, or if the sender fails +/// `authenticate_control_of_sender`. Otherwise replies with the database entry found for every +/// link in this message and in the message it replies to, if any. +async fn handle_command_info( + bot: &Bot, + message: &Message, + database: &Database, +) -> Result<(), RequestError> { + if !(message.chat.is_private() || message.chat.id == CONTROL_CHAT_ID) { + // Not an appropriate chat for this. + return Ok(()); + } + + if !authenticate_control_of_sender(bot, message).await? { + // Not someone who should be able to query stuff. 
+ return Ok(()); + } + + let mut response = String::with_capacity(64); + + for (sanitized_url, _original_url) in iterate_over_all_links(message).chain( + message + .reply_to_message() + .into_iter() + .flat_map(iterate_over_all_links), + ) { + use std::fmt::Write; + writeln!(response, "For URL {sanitized_url}:") + .expect("Writing to a String never fails"); + + let short = match database.get_url(&sanitized_url).await { + Err(e) => { + write!(response, "DATABASE ERROR ON SHORT: {e:#?}\n\n") + .expect("Writing to a String never fails"); + continue; + } + Ok(Some(short)) => short, + Ok(None) => { + response.push_str("No result.\n\n"); + continue; + } + }; + + let long = match database.get_url_by_id_full(short.id()).await { + Err(e) => { + write!(response, "DATABASE ERROR ON LONG: {e:#?}\n\n") + .expect("Writing to a String never fails"); + continue; + } + Ok(Some(short)) => short, + Ok(None) => { + response.push_str("Result has vanished.\n\n"); + continue; + } + }; + + write!(response, "{long}\n\n").expect("Writing to a String never fails"); + } + + let response = if response.is_empty() { + "No URLs found to find info on." + } else { + &response + }; + + bot.archsendmsg_no_link_preview(message.chat.id, response, message.id) + .await?; + + Ok(()) +} + +/// Handle this message assuming it's the command `/mark_spam`, `/mark_url_spam`, `/mark_domain_spam`, +/// `/mark_not_spam`, or `/mark_aggregator`. +/// +/// # Panics +/// Panics if `command` is not one of those. +async fn handle_command_mark( + bot: &Bot, + message: &Message, + database: &Database, + sender_name: &str, + command: &str, +) -> Result<(), RequestError> { + if !(message.chat.is_private() || message.chat.id == CONTROL_CHAT_ID) { + // Not an appropriate chat for this. + return Ok(()); + } + + if !authenticate_control_of_sender(bot, message).await? { + // Not someone who can review stuff. 
+ return Ok(()); + } + + let (designation, header) = match command { + "/mark_spam" | "/mark_url_spam" | "/mark_domain_spam" => { + (UrlDesignation::Spam, "Marked these URLs as spam:\n") + } + "/mark_not_spam" => (UrlDesignation::NotSpam, "Marked as not spam:\n"), + "/mark_aggregator" => (UrlDesignation::Aggregator, "Marked as a link aggregator:\n"), + _ => unreachable!(), + }; + + let mut response = String::with_capacity(64); + + // True if this has changed at least one link in the database. + let mut had_links_changed = false; + // True if this tried to write a link that was already written as is in the database. + let mut had_links_unchanged = false; + + for (mut sanitized_url, mut original_url) in iterate_over_all_links(message) { + if command == "/mark_domain_spam" { + original_url.set_fragment(None); + original_url.set_query(None); + original_url.set_path(""); + sanitized_url.remove_all_but_host(); + } + + let result = insert_or_update_url_with_log( + bot, + database, + Some(sender_name), + &sanitized_url, + &original_url, + designation, + ) + .await?; + + match result { + InsertOrUpdateResult::Inserted { .. } | InsertOrUpdateResult::Updated { .. } => { + if !had_links_changed { + response.push_str(header); + had_links_changed = true; + } + + match result { + InsertOrUpdateResult::Inserted { .. } => { + response.push_str("New: "); + } + InsertOrUpdateResult::Updated { .. } => { + response.push_str("Updated: "); + } + InsertOrUpdateResult::NoChange { .. } => unreachable!(), + } + response.push_str(sanitized_url.as_str()); + response.push('\n'); + } + InsertOrUpdateResult::NoChange { .. } => { + had_links_unchanged = true; + } + } + } + + let footer = match (had_links_unchanged, had_links_changed) { + (false, false) => "\nPlease specify links. 
Replies don't count to avoid accidents.", + (true, false) => "\nThese URLs are already marked as such.", + (true, true) => "\nSome URLs are skipped as they are already marked as such.", + (false, true) => "", + }; + + response.push_str(footer); + + bot.archsendmsg_no_link_preview(message.chat.id, response.as_str(), message.id) + .await?; + + Ok(()) +} + +/// Handle this message assuming it's the command `/spam` or `/scam`. +async fn handle_command_spam( + bot: &Bot, + message: &Message, + database: &Database, + mut sent_by_admin_cache: Option, +) -> Result<(), RequestError> { + let mut replied_to_sent_by_admin_cache: Option = None; + + // This or replied-to message may have sus links. Time to do stuff! + + // First, check and skip if this is a reply to a post by + // the channel that is linked to this chat. + // This is worth doing because such messages are sanctioned by + // the chat's admins, and are most often just a misclick of + // the blue /spam command in comments to channel posts. + // + // The check for above is accomplished by is_sender_admin function, + // but there are other corner cases to consider too. + + // If this scope runs to the end (i.e. doesn't hit any break), reject this as a reply to an admin. + 'reject_from_admin: { + let Some(reply_to) = message.reply_to_message() else { + // This isn't a reply to anything. + break 'reject_from_admin; + }; + + if message + .from + .as_ref() + .is_some_and(|x| Some(x.id) == reply_to.from.as_ref().map(|x| x.id)) + || message + .sender_chat + .as_ref() + .is_some_and(|x| Some(x.id) == reply_to.sender_chat.as_ref().map(|x| x.id)) + { + // The sender is replying to themselves and knows what they're doing. + break 'reject_from_admin; + } + + // This must check the replied-to message, not this one: the cache filled here is reused + // later for links found in the replied-to message. + if !is_sender_admin_with_cache(bot, reply_to, &mut replied_to_sent_by_admin_cache).await? { + // The sender of the replied-to message isn't an admin. + break 'reject_from_admin; + } + + if is_sender_admin_with_cache(bot, message, &mut sent_by_admin_cache).await? 
{ + // The sender of this message *is* an admin. + break 'reject_from_admin; + } + + // The two checks above will return true for private chats without extra + // Telegram API queries. + + // This is an applicable situation. + let response = concat!( + "Sorry, but the message you're replying to is posted by an admin of this chat, ", + "so it is ignored. If you believe it should be marked, ", + "DM this bot with the command /spam badlink.com to submit it anyway." + ); + bot.archsendmsg_no_link_preview(message.chat.id, response, message.id) + .await?; + return Ok(()); + } + + // Nah, they fr. We actually have to do things now. + + // Get sender names appropriate for notifications. + let sender_name_with_id = sender_name_prettyprint(message, true); + let replied_to_sender_name_with_id = message + .reply_to_message() + .map(|x| sender_name_prettyprint(x, true)) + .unwrap_or_default(); + + // Keep track which categories of links have we seen. + let mut some_marked = false; + let mut some_already_on_review = false; + let mut some_already_marked_spam = false; + let mut some_already_manually_reviewed_as_not_spam = false; + + // The big iterator over all links in this message as well as the message it's a reply to. + // Additionally, for every link, it includes the message it's from, and which by_admin cache to + // use (false for this message's, true for replied to message's). + let the_big_iterator = iterate_over_all_links(message) + .map(|(s, u)| (s, u, message, false, &sender_name_with_id)) + .chain( + message + .reply_to_message() + .into_iter() + .flat_map(iterate_over_all_links) + .map(|(sanitized_url, original_url)| { + ( + sanitized_url, + original_url, + message + .reply_to_message() + .expect("If this is run, then replied-to message must exist."), + true, + &replied_to_sender_name_with_id, + ) + }), + ); + + let mut review_header_posted = false; + + // Iterate over all the links ever. 
+ for ( + sanitized_url, + original_url, + reported_message, + admin_cache_is_of_replied_to, + sender_name, + ) in the_big_iterator + { + let this_sent_by_admin_cache = if admin_cache_is_of_replied_to { + &mut replied_to_sent_by_admin_cache + } else { + &mut sent_by_admin_cache + }; + + let sent_by_admin = + is_sender_admin_with_cache(bot, reported_message, this_sent_by_admin_cache).await?; + + let result = database + .send_to_review(&sanitized_url, &original_url) + .await + .expect("Database died!"); + + match result { + SendToReviewResult::Sent { review_entry_id } => { + if !sent_by_admin { + database + .link_sighted( + reported_message.chat.id, + reported_message.id, + sender_name, + &sanitized_url, + ) + .await + .expect("Database died!"); + } + some_marked = true; + + if !review_header_posted { + send_review_header(bot, reported_message, message, Some(&sender_name_with_id)) + .await?; + review_header_posted = true; + } + + send_review_keyboard( + bot, + CONTROL_CHAT_ID, + None, + review_entry_id, + &sanitized_url, + &original_url, + database, + ) + .await?; + } + SendToReviewResult::AlreadyOnReview => some_already_on_review = true, + SendToReviewResult::AlreadyInDatabase(info) => match info.designation() { + UrlDesignation::Spam => some_already_marked_spam = true, + UrlDesignation::NotSpam | UrlDesignation::Aggregator => { + some_already_manually_reviewed_as_not_spam = true; + } + }, + } + } + + // All links marked as appropriate. Tell the user about it. + let response = if some_marked { + "Thank you, links in this message will be reviewed for spam." + } else if some_already_on_review { + "Thank you, but the links in this message are already marked for review." + } else if some_already_marked_spam { + "Thank you, but links in this message are already marked as spam." + } else if some_already_manually_reviewed_as_not_spam { + "Thank you, but the links in this message were manually reviewed and were determined to be not spam." + } else { + // No links at all. 
+ concat!( + "Sorry, but I could not find any links in ", + "your message or the message you replied to, if any. ", + "This bot only blocks messages with usernames, links, and buttons with links." + ) + }; + + bot.archsendmsg_no_link_preview(message.chat.id, response, message.id) + .await?; + + Ok(()) +} + +/// Handle a callback query, assuming it's from a review keyboard. +pub async fn handle_callback_query( + bot: Bot, + query: CallbackQuery, + database: Arc, +) -> Result<(), RequestError> { + macro_rules! goodbye { + ($text:expr) => {{ + bot.answer_callback_query(query.id).text($text).await?; + return Ok(()); + }}; + () => {{ + bot.answer_callback_query(query.id).await?; + return Ok(()); + }}; + } + + let message = match query.message { + Some(MaybeInaccessibleMessage::Regular(message)) => message, + Some(MaybeInaccessibleMessage::Inaccessible(_)) | None => { + goodbye!("Sorry, this review message is too old."); + } + }; + + if !(message.chat.is_private() || message.chat.id == CONTROL_CHAT_ID) { + // In case someone is messing around and somehow sending this event from a random group + // chat or something. + goodbye!("huhh??? ????? guh??"); + } + + let Some(query_data) = query + .data + .and_then(|d| ReviewCallbackData::deserialize_from_str(&d)) + else { + goodbye!("Failed to parse query data."); + }; + + let user = query.from; + + if !authenticate_control(&bot, &user).await? { + goodbye!("Access denied."); + } + + let entities = message + .parse_entities() + .or_else(|| message.parse_caption_entities()) + .unwrap_or_default(); + + let Some((sanitized_url, original_url)) = entities.iter().find_map(|x| get_entity_url(x)) + else { + goodbye!("Review message without a URL to review. How peculiar!"); + }; + + if query_data.url_crc32 != crc32fast::hash(sanitized_url.as_str().as_bytes()) { + goodbye!("CRC32 hash mismatch! Something is fucky. 
Report to bot developer lmao"); + } + + let Some(sanitized_url) = sanitized_url.destructure_to_number(query_data.destructure) else { + goodbye!("Too much URL destructuring. Something is fucky. Report to bot developer lmao"); + }; + + // This message is a review keyboard that was just used. + // Remove it from the database and handle it by ourselves afterward. + + database + .review_keyboard_removed(message.chat.id, message.id) + .await + .expect("Database died!"); + + insert_or_update_url_with_log( + &bot, + &database, + Some(&user_name_prettyprint(&user, true)), + &sanitized_url, + &original_url, + query_data.designation, + ) + .await?; + + // We did the funny. Now for the aftermatch. + + if message.chat.is_private() { + // If this is in a private chat, then this is probably a /review keyboard. + edit_message_into_a_new_review_keyboard(&bot, message.chat.id, message.id, &database) + .await?; + } else { + // If this is in the control chat (or whereever else??), edit to remove the keyboard. 
+ discard_review_keyboard( + &bot, + message.chat.id, + message.id, + &user_name_prettyprint(&user, true), + query_data.designation, + &sanitized_url, + ) + .await?; + } + + Ok(()) +} diff --git a/anti_nft_spam_bot/src/handlers/mod.rs b/anti_nft_spam_bot/src/handlers/mod.rs deleted file mode 100644 index da33c2b..0000000 --- a/anti_nft_spam_bot/src/handlers/mod.rs +++ /dev/null @@ -1,1004 +0,0 @@ -use std::{borrow::Cow, fmt::Write, sync::Arc}; - -use arch_bot_commons::{teloxide_retry, useful_methods::BotArchSendMsg}; -use html_escape::encode_text; -use teloxide::{ - prelude::*, - sugar::request::{RequestLinkPreviewExt, RequestReplyExt}, - types::{ - BotCommand, ChatMember, CopyTextButton, Me, MessageEntityKind, MessageEntityRef, MessageId, - SwitchInlineQueryChosenChat, - }, - ApiError, RequestError, -}; -use url::Url; - -use crate::{ - database::Database, - handlers::reviews::DomainIsProtected, - parse_url_like_telegram, sender_name_prettyprint, - types::{Domain, IsSpam, ReviewResponse}, - CONTROL_CHAT_ID, -}; - -pub mod reviews; -use self::reviews::handle_review_command; - -/// Get a domain and a URL from this entity, if available. -fn get_entity_url_domain(entity: &MessageEntityRef) -> Option<(Url, Domain)> { - let mut url = match entity.kind() { - MessageEntityKind::Url | MessageEntityKind::Code | MessageEntityKind::Pre { .. } => { - // Code and Pre because some spammers use monospace to make links unclickable - // but undetectable. - if let Ok(url) = parse_url_like_telegram(entity.text()) { - url - } else { - if *entity.kind() == MessageEntityKind::Url { - // Does not parse as a URL anyway. Shouldn't happen, but eh. 
- log::warn!("Received an imparsable URL: {}", entity.text()); - } - return None; - } - } - MessageEntityKind::TextLink { url } => url.clone(), - MessageEntityKind::Mention => { - // Text will be like "@amogus" - // Convert it into "https://t.me/amogus" - let username = entity.text().trim_start_matches('@'); - let url_text = format!("https://t.me/{username}"); - - if let Ok(url) = Url::parse(&url_text) { - url - } else { - // Shouldn't happen, but eh. - log::warn!( - "Failed to parse username \"{}\" converted to URL \"{}\"", - entity.text(), - url_text - ); - return None; - } - } - _ => { - return None; - } - }; - // Some telegram spam (like telegram bots) use queries a lot, - // especially referral links in spammed "games". - // Strip those just from telegram URLs. - if crate::spam_checker::is_telegram_url(&url) { - url.set_query(None); - } - - let Some(domain) = Domain::from_url(&url) else { - // Does not have a domain. An IP address link? - log::warn!("Received a URL without a domain: {}", entity.text()); - return None; - }; - - Some((url, domain)) -} - -/// Get a domain and a URL from this button, if available. -fn get_button_url_domain( - button: &teloxide::types::InlineKeyboardButton, -) -> Option<(Cow<'_, Url>, Domain)> { - use teloxide::types::InlineKeyboardButtonKind as Kind; - use teloxide::types::{LoginUrl, WebAppInfo}; - let url = match &button.kind { - Kind::Url(url) - | Kind::LoginUrl(LoginUrl { url, .. }) - | Kind::WebApp(WebAppInfo { url }) => Cow::Borrowed(url), - Kind::SwitchInlineQuery(string) | Kind::CopyText(CopyTextButton { text: string }) => { - Cow::Owned(parse_url_like_telegram(string).ok()?) - } - Kind::SwitchInlineQueryChosenChat(SwitchInlineQueryChosenChat { - query: opt_string, - .. - }) => Cow::Owned(parse_url_like_telegram(opt_string.as_ref()?).ok()?), - Kind::CallbackData(..) - | Kind::Pay(..) - | Kind::SwitchInlineQueryCurrentChat(..) - | Kind::CallbackGame(..) 
=> return None, - }; - - let Some(domain) = Domain::from_url(&url) else { - // Does not have a domain. An IP address link? - log::warn!("Received a URL in a button without a domain: {url}"); - return None; - }; - - Some((url, domain)) -} - -/// Returns `true` if this chat is private. -async fn is_sender_admin(bot: &Bot, message: &Message) -> Result { - if message.chat.is_private() { - return Ok(true); - } - - // First check if a chat sent this, i.e. an anonymous admin. - // In such a case, "from()" returns @GroupAnonymousBot for backwards compatibility. - let is_admin = if let Some(sender_chat) = &message.sender_chat { - if sender_chat.id == message.chat.id { - // If it's posted by the chat itself, it's probably an anonymous admin. - true - } else { - // It may have been sent by the channel linked to this chat, then. - // Check for that. - let chat_full = bot.get_chat(message.chat.id).await?; - - chat_full.linked_chat_id() == Some(sender_chat.id.0) - } - } else if let Some(user) = &message.from { - let ChatMember { kind, .. } = bot.get_chat_member(message.chat.id, user.id).await?; - kind.is_privileged() - } else { - false - }; - - Ok(is_admin) -} - -/// Returns whether or not bad links were present, -/// and any sus links spotted along the way. -pub async fn does_message_have_bad_links( - bot: &Bot, - message: &Message, - database: &Arc, -) -> Result<(bool, Vec), RequestError> { - // Get message "entities". 
- let entities = message - .parse_entities() - .or_else(|| message.parse_caption_entities()) - .unwrap_or_default(); - - let inline_kb = message.reply_markup().map(|x| &x.inline_keyboard); - let the_unholy_links_iterator = entities - .iter() - .filter_map(get_entity_url_domain) - .map(|x| (Cow::Owned(x.0), x.1)) - .chain( - inline_kb - .iter() - .flat_map(|x| x.iter()) - .flat_map(|x| x.iter()) - .filter_map(get_button_url_domain), - ); - - let mut sus_links_present: Vec = Vec::new(); - - for (url, domain) in the_unholy_links_iterator { - log::debug!("Spotted URL with domain {domain}"); - - let Some((is_spam, from_db)) = crate::spam_checker::check(database, &domain, &url).await - else { - continue; - }; - - if is_spam == IsSpam::Maybe { - sus_links_present.push(url.as_ref().clone()); - - if !from_db { - // Checker above marked the URL as maybe spam. Notify the squad. - create_review_notify(bot, database, message, std::iter::once(url.as_ref()), true) - .await; - } - } - - if is_spam == IsSpam::Yes { - return Ok((true, sus_links_present)); - } - } - - // This message *itself* might not have bad links, but it may be a reply across chats - // to a message that does, with a plea to click on the reply. Handle that too. - if let Some(reply_to) = message.reply_to_message() { - if reply_to.chat.id != message.chat.id { - let result = Box::pin(does_message_have_bad_links(bot, reply_to, database)).await?; - sus_links_present.extend_from_slice(&result.1); - if result.0 { - return Ok((true, sus_links_present)); - } - } - } - - Ok((false, sus_links_present)) -} - -pub async fn handle_message( - bot: Bot, - me: Me, - message: Message, - database: Arc, -) -> Result<(), RequestError> { - handle_message_inner(&bot, &me, &message, &database, false).await?; - - // Also handle the message it's a reply to. 
- if let Some(replied_to) = message.reply_to_message() { - handle_message_inner(&bot, &me, replied_to, &database, true).await?; - } - - Ok(()) -} - -/// Set `is_replied_to` to true if this message is being handled in context of being an older -/// message that was replied to and is being checked again. If so, this handler will ignore -/// commands and such. -async fn handle_message_inner( - bot: &Bot, - me: &Me, - message: &Message, - database: &Arc, - is_replied_to: bool, -) -> Result<(), RequestError> { - if let Some(sender) = &message.from { - if sender.id == me.id { - // Ignore messages sent by ourselves. - return Ok(()); - } - } - - let is_edited = message.edit_date().is_some(); - - // First check if it's a private message. - if message.chat.is_private() { - if !is_replied_to && !is_edited { - // Will try handling commands at the end of this function too. - if !handle_command(bot, me, message, database, None).await? { - handle_private_message(bot, message).await?; - } - } - return Ok(()); - } - - // Check if it has any links we want to ban. - let (spam_links_present, sus_links_present) = - does_message_have_bad_links(bot, message, database).await?; - - // We may need to check if the sender is an admin in two different places in this function. - // If that happens, store the result determined first and reuse. - // Check the result now, though. - let sent_by_admin: Option = if message.chat.id == CONTROL_CHAT_ID { - // Everyone in control chat is an "admin". - Some(true) - } else if spam_links_present || !sus_links_present.is_empty() { - // If there's spam/sus links, check if it's by an admin - Some(is_sender_admin(bot, message).await?) - } else { - // If no spam links are present, we don't need to care. - None - }; - - let should_delete = if spam_links_present { - // oh no! 
- if sent_by_admin == Some(true) { - log::debug!("Skipping deleting message from an admin."); - bot.archsendmsg_no_link_preview( - message.chat.id, - "Skipping deleting a message from an admin that contains a spam link.", - None, - ) - .await?; - false - } else { - // Bad links and not an admin. Buh-bye! - true - } - } else { - // No bad links, shouldn't delete. - false - }; - - let sender = sender_name_prettyprint(message, false); - - if should_delete { - delete_spam_message(bot, message.chat.id, message.id, &sender, database).await?; - } else { - // It's (maybe?) not spam. Do the other things. - - if sent_by_admin != Some(true) { - // Mark this message for deletion in case any of the sus links it has get marked as - // spam. - // - // Note: this effectively accumulates links: if a sus link is present, then the message - // is edited to not have it, it would still have it sighted on there. - for url in sus_links_present { - let _ = database - .sus_link_sighted(message, Some(&sender), &url) - .await; - } - } - - if !is_edited && !is_replied_to { - // Deal with unknown sus links... - gather_suspicion(bot, message, sent_by_admin, database).await?; - - if handle_command(bot, me, message, database, sent_by_admin).await? { - return Ok(()); - } - } - } - - Ok(()) -} - -pub async fn delete_spam_message( - bot: &Bot, - chatid: ChatId, - messageid: MessageId, - offending_user_name: &str, - database: &Database, -) -> Result<(), RequestError> { - // Try up to 3 times in case a fail happens lol - for _ in 0..3 { - match bot.delete_message(chatid, messageid).await { - Ok(_) => { - // Make a string, either a @username or full name, - // describing the offending user. 
- if !database - .get_hide_deletes(chatid) - .await - .expect("Database died!") - { - bot.archsendmsg_no_link_preview( - chatid, - format!( - "Removed a message from {} containing a spam link.", - encode_text(&offending_user_name) - ) - .as_str(), - None, - ) - .await?; - } - break; - } - Err(RequestError::Api( - ApiError::MessageIdInvalid | ApiError::MessageToDeleteNotFound, - )) => { - // Someone else probably has already deleted it. That's fine. - break; - } - Err(RequestError::Api(ApiError::MessageCantBeDeleted)) => { - // No rights? // Older than 48 hours? - bot.archsendmsg_no_link_preview( - chatid, - concat!( - "Tried to remove a message containing a spam link, but failed. ", - "This might be because this bot is not an admin with ability to ", - "remove messages, or the message is older than 48 hours.", - ), - None, - ) - .await?; - break; - } - Err(_) => { - // Random network error or whatever, possibly. - // Try again by letting the loop roll. - } - } - } - - Ok(()) -} - -/// Handler to intuit suspicious links based on them being replied to. -/// -/// The parameter `sent_by_admin` may be specified as `None`; in this case, it will be checked for -/// within this function as needed. -/// -/// If this message starts with /spam or /scam, then the user wants this bot to see them. -async fn gather_suspicion<'a>( - bot: &'a Bot, - message: &'a Message, - mut sent_by_admin: Option, - database: &'a Database, -) -> Result<(), RequestError> { - let Some(text) = message.text() else { - return Ok(()); - }; - - let text = text.to_lowercase(); - - let mut replied_to_sent_by_admin = None; - - // Old check that captured links in a wider net. - // Now our review queue is too big, so this is scaled - // down to only handling the explicit /spam and /scam commands. 
- //if text.contains("spam") - // || text.contains("scam") - // || text.contains("admin") - // || text.contains("begone") - - if text.starts_with("/spam") | text.starts_with("/scam") { - // This or replied-to message may have sus links. - - // First, check and skip if this is a reply to a post by - // the channel that is linked to this chat. - // This is worth doing because such messages are sanctioned by - // the chat's admins, and are most often just a misclick of - // the blue /spam command in comments to channel posts. - // - // The check for above is accomplished by is_sender_admin function, - // but there are other corner cases to consider too. - - // If this scope runs to the end (i.e. doesn't hit any break), reject this as a reply to an admin. - 'reject_from_admin: { - let Some(reply_to) = message.reply_to_message() else { - // This isn't a reply to anything. - break 'reject_from_admin; - }; - - if message - .from - .as_ref() - .is_some_and(|x| Some(x.id) == reply_to.from.as_ref().map(|x| x.id)) - || message - .sender_chat - .as_ref() - .is_some_and(|x| Some(x.id) == reply_to.sender_chat.as_ref().map(|x| x.id)) - { - // The sender is replying to themselves and knows what they're doing. - break 'reject_from_admin; - } - - if replied_to_sent_by_admin.is_none() { - replied_to_sent_by_admin = Some(is_sender_admin(bot, reply_to).await?); - } - if replied_to_sent_by_admin != Some(true) { - // The sender of the replied-to message isn't an admin. - break 'reject_from_admin; - } - - if sent_by_admin.is_none() { - sent_by_admin = Some(is_sender_admin(bot, message).await?); - } - - if sent_by_admin == Some(true) { - // The sender of this message *is* an admin. - break 'reject_from_admin; - } - - // The two checks above will return true for private chats without extra - // Telegram API queries. - - // This is an applicable situation. - let response = concat!( - "Sorry, but the message you're replying to is posted by an admin of this chat, ", - "so it is ignored. 
If you believe it should be marked, ", - "DM this bot with the command /spam badlink.com to submit it anyway." - ); - bot.archsendmsg_no_link_preview(message.chat.id, response, message.id) - .await?; - return Ok(()); - } - - // Find and tag the sus links. - - let mut had_links = false; - - let mut already_marked_sus_count = 0u32; - let mut already_marked_spam_count = 0u32; - let mut manually_reviewed_not_spam_count = 0u32; - - let mut links_marked: Vec> = Vec::new(); - - let sendername = sender_name_prettyprint(message, false); - - let mut mark_sus = async |offending_message: &Message, - sent_by_admin: Option, - sendername: &str, - url: Cow<'a, Url>, - domain: &Domain| { - log::debug!("Marking {url} and its domain as sus..."); - - had_links = true; - - let result = database - .mark_sus(&url, Some(domain)) - .await - .expect("Database died!"); - - { - use crate::types::MarkSusResult::*; - - match result { - Marked => { - // Log it, if need be... - if sent_by_admin != Some(true) { - database - .sus_link_sighted(offending_message, Some(sendername), &url) - .await - .expect("Database died!"); - } - links_marked.push(url); - } - AlreadyMarkedSus => already_marked_sus_count += 1, - AlreadyMarkedSpam => already_marked_spam_count += 1, - ManuallyReviewedNotSpam => manually_reviewed_not_spam_count += 1, - } - } - }; - - if let Some(entities) = message - .parse_entities() - .or_else(|| message.parse_caption_entities()) - { - for entity in &entities { - if let Some((url, domain)) = get_entity_url_domain(entity) { - if sent_by_admin.is_none() { - sent_by_admin = Some(is_sender_admin(bot, message).await?); - } - mark_sus( - message, - sent_by_admin, - &sendername, - Cow::Owned(url), - &domain, - ) - .await; - } - } - } - - // Get replied-to message "entities", if any. 
- if let Some(replied_message) = message.reply_to_message() { - let replied_to_sender_name = sender_name_prettyprint(replied_message, false); - if let Some(replied_entities) = replied_message - .parse_entities() - .or_else(|| replied_message.parse_caption_entities()) - { - for entity in &replied_entities { - let Some((url, domain)) = get_entity_url_domain(entity) else { - continue; - }; - - if replied_to_sent_by_admin.is_none() { - replied_to_sent_by_admin = - Some(is_sender_admin(bot, replied_message).await?); - } - - mark_sus( - replied_message, - replied_to_sent_by_admin, - &replied_to_sender_name, - Cow::Owned(url), - &domain, - ) - .await; - } - } - - // While we're here, check for links in buttons on the replied-to message. - if let Some(markup) = replied_message.reply_markup() { - for row in &markup.inline_keyboard { - for button in row { - let Some((url, domain)) = get_button_url_domain(button) else { - continue; - }; - - if replied_to_sent_by_admin.is_none() { - replied_to_sent_by_admin = - Some(is_sender_admin(bot, replied_message).await?); - } - mark_sus( - replied_message, - replied_to_sent_by_admin, - &replied_to_sender_name, - url, - &domain, - ) - .await; - } - } - } - } - - // We assume there would be no buttons on the /spam message we're - // working for that we need to check. That'd be kind of ridiculous lol - - let mut response; - let marked = !links_marked.is_empty(); - let already_marked_sus = already_marked_sus_count > 0; - let already_marked_spam = already_marked_spam_count > 0; - let manually_reviewed_not_spam = manually_reviewed_not_spam_count > 0; - - //if text.starts_with("/spam") | text.starts_with("/scam") - // This condition is moved to the top of the function now. - { - // That's the bot command, most likely. Users like indication that it does things. 
- - // Purposefully ambiguous message wording, where "this" both refers to the - // message we're replying to and the message they replied to lol - - let response = if marked { - "Thank you, links in this message will be reviewed for spam." - } else if had_links { - // Didn't mark anything as sus, but the message had links. - - response = String::from("Thank you, but the links in this message are "); - - if already_marked_sus { - response.push_str("already marked for review"); - if already_marked_spam | manually_reviewed_not_spam { - response.push_str(", and some are "); - } - } - - if already_marked_spam { - response.push_str("already marked for spam"); - if manually_reviewed_not_spam { - response.push_str(", and some are "); - } - } - - if manually_reviewed_not_spam { - response.push_str("manually reviewed and were determined to be not spam"); - } - - response.push('.'); - - response.as_str() - } else { - // No links at all. - concat!( - "Sorry, but I could not find any links in ", - "your message or the message you replied to, if any. ", - "This bot only blocks messages with usernames, links, and buttons with links." - ) - }; - bot.archsendmsg_no_link_preview(message.chat.id, response, message.id) - .await?; - } - - if marked { - // We marked something. In this case, notify reviewers to review. - create_review_notify( - bot, - database, - message, - links_marked.iter().map(std::convert::AsRef::as_ref), - false, - ) - .await; - } - } - - Ok(()) -} - -/// Returns `true` if a command was parsed and responded to. -async fn handle_command( - bot: &Bot, - me: &Me, - message: &Message, - database: &Arc, - mut sent_by_admin: Option, -) -> Result { - // Conclusively check if the message was sent by an admin. - let mut byadmin = async || -> Result { - if let Some(sent_by_admin) = sent_by_admin { - Ok(sent_by_admin) - } else { - let result = is_sender_admin(bot, message).await?; - sent_by_admin = Some(result); - Ok(result) - } - }; - - macro_rules! 
respond { - ($text:expr) => { - bot.archsendmsg_no_link_preview(message.chat.id, $text, message.id) - .await?; - }; - } - - macro_rules! goodbye { - ($text:expr) => {{ - respond!($text); - return Ok(true); - }}; - } - - if message.edit_date().is_some() { - // Ignore message edits here. - return Ok(false); - } - - let is_private = message.chat.is_private(); - - let Some(text) = message.text() else { - return Ok(false); - }; - - // Special case: nag for cash money if someone says "good bot" uwu - static GOOD_BOT: &str = "good bot"; - if let Some((maybe, _)) = text.split_at_checked(GOOD_BOT.len()) { - if maybe.eq_ignore_ascii_case(GOOD_BOT) - && (is_private - || message - .reply_to_message() - .and_then(|x| x.from.as_ref()) - .is_some_and(|x| x.id == me.id)) - { - static NAG: &str = - "(Consider supporting? 👉👈)"; - bot.send_message(message.chat.id, NAG) - .reply_to(message.id) - .parse_mode(teloxide::types::ParseMode::Html) - .disable_link_preview(true) - .await?; - return Ok(true); - } - } - - // Check if it starts with "/", like how a command should. - if !text.starts_with('/') { - return Ok(false); - } - // Get first word in the message, the command itself. - let Some(command) = text.split_whitespace().next() else { - return Ok(false); - }; - - let command_full_len = command.len(); - - // Trim the bot's username from the command and convert to lowercase. - let username = format!("@{}", me.username()); - let command = command.trim_end_matches(username.as_str()).to_lowercase(); - let _params = &text[command_full_len..].trim_start(); - - let command_processed: bool = match command.as_str() { - "/review" if is_private => handle_review_command(bot, message, database).await?, - "/spam" | "/scam" if is_private => { - // This is a private messages only handler. This is already run for public messages - // differently, to catch non-command suspicions, so running it here would run it twice. 
- gather_suspicion(bot, message, sent_by_admin, database).await?; - true - } - "/hide_deletes" | "/show_deletes" => { - if message.chat.is_private() || !byadmin().await? { - goodbye!("This command can only be used by admins in group chats."); - } - - let new_state = command.as_str() == "/hide_deletes"; - - let old_state = database - .set_hide_deletes(message.chat.id, new_state) - .await - .expect("Database died!"); - - let response = match (old_state, new_state) { - (false, false) => "This chat doesn't hide spam deletion notifications already.", - (false, true) => concat!( - "I will no longer notify about messages being deleted. ", - "Note that this may lead to confusion in case I delete a ", - "message with a legitimate link due to a false positive. " - ), - (true, false) => "From now on I will notify about spam messages being deleted.", - (true, true) => "This chat has spam delete notifications hidden already.", - }; - - goodbye!(response); - } - "/mark_not_spam" | "/mark_url_spam" | "/mark_domain_spam" => { - // If it's not a private/control chat, or no sender, or they're not - // in control chat, pretend we do not see it. - if !(message.chat.is_private() || message.chat.id == CONTROL_CHAT_ID) { - return Ok(false); - } - let Some(sender) = &message.from else { - return Ok(false); - }; - if !reviews::authenticate_control(bot, sender).await? { - return Ok(false); - } - - // Get message "entities". - let Some(entities) = message - .parse_entities() - .or_else(|| message.parse_caption_entities()) - else { - goodbye!("Please specify links. Replies don't count to avoid accidents."); - }; - - let mut response = String::new(); - let mut wrote_header = false; - - // Scan all URLs in the message... 
- for entity in &entities { - let Some((mut url, domain)) = get_entity_url_domain(entity) else { - continue; - }; - - let (mut action, header) = match command.as_str() { - "/mark_not_spam" => ( - ReviewResponse::NotSpam(Some(domain), url), - "Marked as not spam:\n", - ), - "/mark_url_spam" => ( - ReviewResponse::UrlSpam(Some(domain), url), - "Marked these URLs as spam:\n", - ), - "/mark_domain_spam" => ( - ReviewResponse::DomainSpam(domain, url), - "Marked domains of these URLs as spam:\n", - ), - - _ => unreachable!(), - }; - - let result = - reviews::apply_review_unverified(bot, sender, database, &mut action).await?; - - // Get the URL back lol - url = action.deconstruct().unwrap().1; - - if let Err(DomainIsProtected) = result { - writeln!( - response, - "⚠️ Domain of {url} is protected and cannot be marked as spam." - ) - } else { - if !wrote_header { - response.push_str(header); - wrote_header = true; - } - writeln!(response, "{url}") - } - .expect("String writing is infallible"); - } - - if response.is_empty() { - goodbye!("Please specify links. Replies don't count to avoid accidents."); - } - - goodbye!(response.as_str()); - } - // Any kind of "/start", "/help" commands would yield false and - // hence cause the help message to be printed if this is a private chat. - // See definition of handle_private_message. - _ => false, - }; - - Ok(command_processed) -} - -pub fn generate_bot_commands() -> Vec { - vec![ - BotCommand::new("/hide_deletes", "Hide spam deletion notification messages."), - BotCommand::new( - "/show_deletes", - "Don't hide spam deletion notification messages.", - ), - BotCommand::new("/spam", "Mark links in a message for review as spam."), - ] -} - -pub async fn handle_private_message(bot: &Bot, message: &Message) -> Result<(), RequestError> { - if message.edit_date().is_some() { - // Ignore message edits here. 
- return Ok(()); - } - // Nothing much to do here lol - - bot.send_message( - message.chat.id, - " -This bot is made to combat the currently ongoing wave of NFT spam experienced by chats with linked channels. - -To use this bot, add it to a chat and give it administrator status with \"Remove messages\" permission. - -No further setup is required. A message will be sent when spam is removed. - -For available commands, type / into the message text box below and see the previews. - -If you're in the group for volunteers to manually review chats, you can also use commands here in private chat: - -/mark_not_spam, /mark_url_spam and /mark_domain_spam" - ) - .await?; - Ok(()) -} - -/// Set `automatic` to true if this review notify was automatically -/// decided by the bot. -pub async fn create_review_notify( - bot: &Bot, - database: &Database, - message: &Message, - links_marked: impl Iterator, - automatic: bool, -) { - let to_review = database.get_review_count().await.expect("Database died!"); - let username_string: String; - let username: &str = if automatic { - "automatic check" - } else { - username_string = sender_name_prettyprint(message, true); - &username_string - }; - - let chatname = if let Some(username) = message.chat.username() { - format!("@{} (chatid {})", username, message.chat.id) - } else if let Some(title) = message.chat.title() { - format!("{} (chatid {})", title, message.chat.id) - } else { - format!("Unknown (chatid {})", message.chat.id) - }; - - use teloxide::types::{InlineKeyboardButton, InlineKeyboardMarkup}; - - // Also create a keyboard for review buttons... 
- let keyboard = InlineKeyboardMarkup::new(vec![ - vec![ - InlineKeyboardButton::callback( - "Mark URLs spam".to_string(), - "URL_SPAM derive".to_string(), - ), - InlineKeyboardButton::callback( - "Mark DOMAINS spam".to_string(), - "DOMAIN_SPAM derive".to_string(), - ), - ], - vec![InlineKeyboardButton::callback( - "Not spam".to_string(), - "NOT_SPAM derive".to_string(), - )], - ]); - - let mut notify_text = - format!("New link(s) were added to review pool by {username} in {chatname}:\n"); - - use std::fmt::Write; - - for url in links_marked { - let _ = writeln!(notify_text, "URL: {url}\n"); - } - - let _ = writeln!(notify_text, "There are {to_review} links to review."); - - // Forward offending messages. - // It's a good nicety, but it could fail: the messages may be protected from forwarding, or an - // admin might have deleted them in just the right moment, or telegram goes funny again. So, - // honestly, just ignore it failing. - // - // Decide what exactly to forward and how first, though. - match message.reply_to_message() { - None => { - // Just forward this one. - let _ = bot - .forward_message(CONTROL_CHAT_ID, message.chat.id, message.id) - .await; - } - Some(reply_to) if reply_to.chat.id == message.chat.id => { - // In the same chat. Forward them both with this call. - let _ = bot - .forward_messages(CONTROL_CHAT_ID, message.chat.id, [reply_to.id, message.id]) - .await; - } - Some(reply_to) => { - // In different chats. Forward individually. 
- let _ = bot - .forward_message(CONTROL_CHAT_ID, reply_to.chat.id, reply_to.id) - .await; - let _ = bot - .forward_message(CONTROL_CHAT_ID, message.chat.id, message.id) - .await; - } - } - - if teloxide_retry!( - bot.send_message(CONTROL_CHAT_ID, ¬ify_text) - .parse_mode(teloxide::types::ParseMode::Html) - .reply_markup(keyboard.clone()) - .disable_link_preview(true) - .await - ) - .is_err() - { - log::error!("Failed notifying control chat of new marked sus link!\n{notify_text}"); - } -} diff --git a/anti_nft_spam_bot/src/handlers/reviews.rs b/anti_nft_spam_bot/src/handlers/reviews.rs deleted file mode 100644 index a20fe42..0000000 --- a/anti_nft_spam_bot/src/handlers/reviews.rs +++ /dev/null @@ -1,388 +0,0 @@ -use std::fmt::Write; -use std::sync::Arc; - -use teloxide::{ - payloads::{AnswerCallbackQuerySetters, EditMessageTextSetters}, - requests::Requester, - sugar::request::{RequestLinkPreviewExt, RequestReplyExt}, - types::{ - CallbackQuery, InlineKeyboardButton, InlineKeyboardMarkup, MaybeInaccessibleMessage, - Message, User, - }, - ApiError, Bot, RequestError, -}; - -use crate::{ - database::Database, - types::{IsSpam, ReviewResponse}, - CONTROL_CHAT_ID, REVIEW_LOG_CHANNEL_ID, -}; - -use super::delete_spam_message; - -/// Check if this user is in the control chat and can do reviews, and -/// delay their requests if appropriate. -pub async fn authenticate_control(bot: &Bot, user: &User) -> Result { - let control = bot - .get_chat_member(CONTROL_CHAT_ID, user.id) - .await? - .is_present(); - if !control { - let name = if let Some(username) = &user.username { - format!("@{username}") - } else { - user.full_name() - }; - - log::info!( - "Unauthorized user trying to access reviews: {} (userid {})", - name, - user.id - ); - // Not a member. - // Now, facts: - // 1. This function will only be run in context of a private chat. - // - // 2. 
Teloxide intentionally processes messages from one chat - // not-concurrently; that is, if we delay now, this will delay - // processing all following direct messages sent by that person - // to this bot. - // - // 3. There is no pertinent reason to DM this bot other than to get - // the help message. - // - // 4. If a user is sending DMs to this bot, that means that they - // have already sent `/start`, and hence have already seen the - // help message. - // - // 5. Therefore, there is no harm to be done by delaying users - // not legible for reviews for DMs. - // - // 6. Bad actors may want to try and spam this bot `/review` to - // cause it to send the above API request many times and in turn - // get rate limited by telegram. - // - // With that in mind, delay this user from accessing this bot for 5 seconds. - tokio::time::sleep(std::time::Duration::from_secs(5)).await; - } - Ok(control) -} - -/// Returns true if the command was processed, or false if it was ignored. -pub async fn handle_review_command( - bot: &Bot, - message: &Message, - database: &Database, -) -> Result { - // Check if it's sent by a user. Otherwise, we don't care. - let Some(user) = &message.from else { - return Ok(false); - }; - - // Check if that user is anyone in the control chat... - - if !authenticate_control(bot, user).await? { - return Ok(false); - } - - // Spawn a review keyboard. - - let message = bot - .send_message(message.chat.id, "Loading review keyboard...") - .reply_to(message.id) - .await?; - - edit_message_into_a_review(bot, database, &message).await?; - - Ok(true) -} - -async fn edit_message_into_a_review( - bot: &Bot, - database: &Database, - message: &Message, -) -> Result<(), RequestError> { - // Telegram's inline keyboards only support up to 128 - // bytes long payload data. We can't hope to store the full - // URL in that, so we store a table name, row ID, and hash of the URL - // instead. 
- let Some((url, table_name, rowid, is_spam)) = - database.get_url_for_review().await.expect("Database died!") - else { - bot.edit_message_text( - message.chat.id, - message.id, - "There are no more URLs to review.", - ) - .reply_markup(InlineKeyboardMarkup { - inline_keyboard: Vec::new(), - }) - .await?; - return Ok(()); - }; - - let url_hash = crc32fast::hash(url.as_str().as_bytes()); - - let title = match is_spam { - IsSpam::Maybe => "REVIEW:\n\n", - IsSpam::No | IsSpam::Yes => concat!( - "REHASHING: \n", - "There are no more URLs to review right now, ", - "so existing entries are shown to weed out ", - "any potential false positives.\n\n" - ), - }; - - let considered = match is_spam { - IsSpam::No => concat!( - "This URL is currently NOT considered as spam, ", - "but is presented for review in case it's wrong.\n\n" - ), - IsSpam::Yes => concat!( - "This URL is currently considered as spam, ", - "but is presented for review in case it's a false positive.\n\n" - ), - IsSpam::Maybe => "", - }; - - let text = format!("{title}{considered}{url}\n\nWhat is spam here?"); - - let keyboard = InlineKeyboardMarkup::new(vec![ - vec![ - InlineKeyboardButton::callback( - "Just the URL".to_string(), - format!("URL_SPAM {table_name} {rowid} {url_hash}"), - ), - InlineKeyboardButton::callback( - "Entire DOMAIN".to_string(), - format!("DOMAIN_SPAM {table_name} {rowid} {url_hash}"), - ), - ], - vec![ - InlineKeyboardButton::callback( - "Not spam".to_string(), - format!("NOT_SPAM {table_name} {rowid} {url_hash}"), - ), - InlineKeyboardButton::callback("Skip".to_string(), "SKIP".to_string()), - ], - ]); - - let edit_result = bot - .edit_message_text(message.chat.id, message.id, text) - .parse_mode(teloxide::types::ParseMode::Html) - .reply_markup(keyboard) - .await; - - // If we get this error, that means that the message was modified to the - // exact same thing as it was before. This means we're getting the same thing. 
- if let Err(RequestError::Api(ApiError::MessageNotModified)) = edit_result { - bot.edit_message_text( - message.chat.id, - message.id, - "There are no more URLs to review.", - ) - .reply_markup(InlineKeyboardMarkup { - inline_keyboard: Vec::new(), - }) - .await?; - return Ok(()); - } - - edit_result?; - Ok(()) -} - -pub async fn parse_callback_query( - bot: Bot, - query: CallbackQuery, - db: Arc, -) -> Result<(), RequestError> { - macro_rules! goodbye { - ($text:expr) => {{ - bot.answer_callback_query(query.id).text($text).await?; - return Ok(()); - }}; - () => {{ - bot.answer_callback_query(query.id).await?; - return Ok(()); - }}; - } - - let Some(query_data) = query.data else { - goodbye!("No query data."); - }; - - let message = match &query.message { - Some(MaybeInaccessibleMessage::Regular(message)) => Some(message.as_ref()), - Some(MaybeInaccessibleMessage::Inaccessible(_)) | None => None, - }; - - let user = query.from; - - let mut responses = match ReviewResponse::from_str(query_data.as_str(), &db, message).await { - Ok(r) => r, - Err(e) => { - goodbye!(&format!("Invalid query data: {e}")); - } - }; - - if responses.is_empty() { - goodbye!("Nothing to mark here...???"); - } - - // First check for protected domains. If so, do nothing, bail out. - for response in &mut responses { - if let Some(protected_domain) = response - .conflicts_with_protected_domains(&db) - .await - .expect("Database died!") - { - goodbye!(format!( - "⚠️ Domain {} is protected and cannot be marked as spam.", - protected_domain - )); - }; - } - - for response in &mut responses { - match apply_review(&bot, &user, &db, response).await? { - Err(Unauthorized) => goodbye!("Access denied."), - Ok(Ok(())) => (), - Ok(Err(DomainIsProtected)) => { - goodbye!("⚠️ This domain is protected and cannot be marked as spam.") - } - }; - } - - let Some(message) = message else { - // May happen if the message is too old - goodbye!("Review taken. 
Please send /review to perform more reviews."); - }; - - // Avoid editing the message into reviews if it's not in private i.e. in work chat - if message.chat.is_private() { - edit_message_into_a_review(&bot, &db, message).await?; - } else { - // It's a notification about newly marked URLs that was just reviewed on. - // Edit it to get rid of the buttons and stuff. - - let name = if let Some(username) = &user.username { - format!("@{username}") - } else { - user.full_name() - }; - - let mut text = format!("Handled by {} (userid {}):\n", name, user.id); - for response in &responses { - let _ = writeln!(&mut text, "{response}"); - } - - if let Some(msgtext) = message.text() { - let _ = write!(&mut text, "\nOriginal message text:\n{msgtext}"); - } - - bot.edit_message_text(message.chat.id, message.id, text) - .disable_link_preview(true) - .await?; - } - goodbye!(); -} - -#[derive(Debug)] -pub struct Unauthorized; -#[derive(Debug)] -pub struct DomainIsProtected; - -/// Apply this review response as coming from this user. -/// -/// Returns true if succeeded, false if the user is not in control chat. -pub async fn apply_review( - bot: &Bot, - user: &User, - db: &Arc, - response: &mut ReviewResponse, -) -> Result, Unauthorized>, RequestError> { - // ....shush. - - if !authenticate_control(bot, user).await? { - return Ok(Err(Unauthorized)); - } - - Ok(Ok(apply_review_unverified(bot, user, db, response).await?)) -} - -/// Apply this review response as coming from this user. -/// -/// Will not check if this user actually is in control chat. -/// -/// `response` is taken mutably because the function may populate the `domain` field of some -/// variants. 
-pub async fn apply_review_unverified( - bot: &Bot, - user: &User, - db: &Arc, - response: &mut ReviewResponse, -) -> Result, RequestError> { - if response - .conflicts_with_protected_domains(db) - .await - .expect("Database died!") - .is_some() - { - return Ok(Err(DomainIsProtected)); - } - - // See if it should be written into the log... - let should_be_logged = response - .conflicts_with_db(db) - .await - .expect("Database died!"); - - // Before we apply it to the database, deal with sightings of this link - // (and potentially other bad links lol) - if let Some((_, url)) = response.as_ref_url_domain() { - if response.marks_as_spam() { - let sightings = db - .drain_all_sightings_of_spam(url) - .await - .expect("Database died!"); - let db = db.clone(); - let bot = bot.clone(); - tokio::spawn(async move { - for (chatid, messageid, offending_user_name) in sightings { - let _ = delete_spam_message(&bot, chatid, messageid, &offending_user_name, &db) - .await; - } - }); - } else { - db.delete_all_sightings_of(url) - .await - .expect("Database died!"); - } - } - - // Ingest it into the database... - db.read_review_response(response) - .await - .expect("Database died!"); - - // Write it to the log... - if should_be_logged { - // Something wasn't marked as spam, but now will be. - // This warrants logging. - - let name = if let Some(username) = &user.username { - format!("@{username}") - } else { - user.full_name() - }; - - let log_message = format!("{} (userid {})\n{}", name, user.id, response); - - bot.send_message(REVIEW_LOG_CHANNEL_ID, log_message) - .disable_link_preview(true) - .await?; - } - - Ok(Ok(())) -} diff --git a/anti_nft_spam_bot/src/lib.rs b/anti_nft_spam_bot/src/lib.rs index 483c5f3..761f1cc 100644 --- a/anti_nft_spam_bot/src/lib.rs +++ b/anti_nft_spam_bot/src/lib.rs @@ -1,13 +1,31 @@ +//! Source code for Anti NFT Spam Bot, aka `@Anti_NFT_Spam_Bot` on Telegram. + +/// Sanitized URL type. 
Probably should go into types lol +mod sanitized_url; + +/// Various types used throughout. +mod types; + +/// The database. mod database; -mod entry; + +/// Miscellaneous functions. +mod misc; + +/// Functions that perform stuff via the bot. +mod actions; + +/// Functions that handle events from Telegram. mod handlers; + +/// Spam checker functionality. mod spam_checker; -mod types; +/// Entry function that starts the bot. +mod entry; pub use entry::*; -use teloxide::types::{ChatId, Message}; -use url::Url; +use teloxide::types::ChatId; /// An ID of a private chat with the developers of the bot, /// as well as volunteers who partake in manual review of links for spam. @@ -17,61 +35,3 @@ pub static CONTROL_CHAT_ID: ChatId = ChatId(-1002065680710); /// This is primarily to spot abuse and to note which URLs the bot /// could have caught automatically but did not. pub static REVIEW_LOG_CHANNEL_ID: ChatId = ChatId(-1002128704357); - -/// Try to parse a string as a [`Url`] in a way that telegram parses it, -/// with allowing an implicit `http://` prefix. -/// -/// # Errors -/// Errors if it fails to parse either way. -pub fn parse_url_like_telegram(string: &str) -> Result { - match Url::parse(string) { - Ok(url) => Ok(url), - Err(e) => { - // We want to return this original error if the next step fails. - if let Ok(url) = Url::parse(&format!("http://{string}")) { - Ok(url) - } else { - Err(e) - } - } - } -} - -#[must_use] -pub fn sender_name_prettyprint(message: &Message, with_id: bool) -> String { - let mut userid = None; - let mut chatid = None; - let mut name = if let Some(chat) = &message.sender_chat { - chatid = Some(chat.id); - if let Some(username) = chat.username() { - format!("@{} (chatid {})", username, chat.id) - } else if let Some(title) = chat.title() { - title.to_string() - } else { - // Shouldn't happen, but eh. 
- "a private user".to_string() - } - } else if let Some(user) = &message.from { - userid = Some(user.id); - if let Some(username) = &user.username { - format!("@{username}") - } else { - user.full_name().to_string() - } - } else { - // Shouldn't happen either, but eh. - "a private user".to_string() - }; - - if with_id { - use std::fmt::Write; - if let Some(userid) = userid { - let _ = write!(name, " (userid {userid})"); - } - if let Some(chatid) = chatid { - let _ = write!(name, " (chatid {chatid})"); - } - } - - name -} diff --git a/anti_nft_spam_bot/src/main.rs b/anti_nft_spam_bot/src/main.rs index a709b04..24c81c9 100644 --- a/anti_nft_spam_bot/src/main.rs +++ b/anti_nft_spam_bot/src/main.rs @@ -1,9 +1,9 @@ +//! Starts Anti NFT Spam Bot lol + use arch_bot_commons::*; fn main() { - if std::env::var_os("RUST_LOG").is_none() { - // TODO: Audit that the environment access only happens in single-threaded code. - unsafe { std::env::set_var("RUST_LOG", "WARN,anti_nft_spam_bot=debug") }; - } + // SAFETY: No other threads were made yet. + unsafe { std::env::set_var("RUST_LOG", "WARN,anti_nft_spam_bot=debug") }; start_everything(anti_nft_spam_bot::entry()); } diff --git a/anti_nft_spam_bot/src/misc.rs b/anti_nft_spam_bot/src/misc.rs new file mode 100644 index 0000000..8096d1c --- /dev/null +++ b/anti_nft_spam_bot/src/misc.rs @@ -0,0 +1,270 @@ +use teloxide::{ + prelude::Requester, + types::{Chat, ChatMember, InlineKeyboardButton, Message, MessageEntityRef, User}, + Bot, RequestError, +}; +use url::Url; + +use crate::{ + database::Database, sanitized_url::SanitizedUrl, spam_checker::is_url_spam, CONTROL_CHAT_ID, +}; + +/// Try to parse a string as a [`Url`] in a way that telegram parses it, +/// with allowing an implicit `https://` prefix, or as a username. +/// +/// # Errors +/// Errors if it fails to parse either way. 
+pub fn parse_url_like_telegram(string: &str) -> Result { + if let Some(username) = string.strip_prefix('@') { + // Probably a username like "@amogus" + // Convert to a format like "https://t.me/amogus" then parse + return Url::parse(&format!("https://t.me/{username}")); + } + + match Url::parse(string) { + Ok(url) => Ok(url), + Err(e @ url::ParseError::RelativeUrlWithoutBase) => { + // Try prepending https:// to it + if let Ok(url) = Url::parse(&format!("https://{string}")) { + Ok(url) + } else { + Err(e) + } + } + Err(e) => Err(e), + } +} + +/// Tries to print the user in the prettiest way possible, with either `@username` or full name +/// that hopefully links to the user. Optionally allows including user ID. +#[must_use] +pub fn user_name_prettyprint(user: &User, with_id: bool) -> String { + let mut name = { + if let Some(username) = &user.username { + format!("@{username}") + } else { + let mut full_name = format!("{}", user.id, user.first_name); + + if let Some(last_name) = &user.last_name { + full_name.push(' '); + full_name.push_str(last_name); + } + + full_name.push_str(""); + + full_name + } + }; + + if with_id { + use std::fmt::Write; + write!(name, " (userid {})", user.id).expect("Writing to a String never fails"); + } + + name +} + +/// Tries to print the chat name in the prettiest way possible, with either `@username` or chat +/// title or full name. +#[must_use] +pub fn chat_name_prettyprint(chat: &Chat, with_id: bool) -> String { + let mut name = if let Some(username) = chat.username() { + format!("@{username}") + } else if let Some(title) = chat.title() { + title.to_string() + } else if let Some(first_name) = chat.first_name() { + let mut full_name = first_name.to_string(); + + if let Some(last_name) = chat.last_name() { + full_name.push(' '); + full_name.push_str(last_name); + } + full_name + } else { + // Shouldn't happen, but eh. 
+ "a private chat".to_string() + }; + + if with_id { + use std::fmt::Write; + write!(name, " (chatid {})", chat.id).expect("Writing to a String never fails"); + } + + name +} + +/// Tries to print the name of the sender of this message, using either [`user_name_prettyprint`] +/// or [`chat_name_prettyprint`]. +#[must_use] +pub fn sender_name_prettyprint(message: &Message, with_id: bool) -> String { + if let Some(chat) = &message.sender_chat { + chat_name_prettyprint(chat, with_id) + } else if let Some(user) = &message.from { + user_name_prettyprint(user, with_id) + } else { + // Shouldn't happen, but eh. + "a private sender".to_string() + } +} + +/// Get a URL from this message entity, if available. +#[must_use] +pub fn get_entity_url(entity: &MessageEntityRef) -> Option<(SanitizedUrl, Url)> { + use teloxide::types::MessageEntityKind as Kind; + + match entity.kind() { + Kind::Url | Kind::Code | Kind::Pre { .. } | Kind::Mention => { + // Code and Pre because some spammers use monospace to make links clickable but + // undetectable. Mentions are links too. + SanitizedUrl::from_str_with_original(entity.text()) + } + Kind::TextLink { url } => SanitizedUrl::from_url_with_original(url.clone()), + Kind::TextMention { user } => user + .username + .as_ref() + .map(|u| format!("https://t.me/{u}")) + .and_then(|s| SanitizedUrl::from_str_with_original(&s)), + _ => None, + } +} + +/// Get a URL from this button, if available. +#[must_use] +pub fn get_button_url(button: &InlineKeyboardButton) -> Option<(SanitizedUrl, Url)> { + use teloxide::types::InlineKeyboardButtonKind as Kind; + use teloxide::types::{CopyTextButton, LoginUrl, SwitchInlineQueryChosenChat, WebAppInfo}; + + match &button.kind { + Kind::Url(url) + | Kind::LoginUrl(LoginUrl { url, .. 
}) + | Kind::WebApp(WebAppInfo { url }) => SanitizedUrl::from_url_with_original(url.clone()), + Kind::SwitchInlineQuery(string) + | Kind::CopyText(CopyTextButton { text: string }) + | Kind::SwitchInlineQueryCurrentChat(string) => { + SanitizedUrl::from_str_with_original(string) + } + Kind::SwitchInlineQueryChosenChat(SwitchInlineQueryChosenChat { + query: opt_string, + .. + }) => opt_string + .as_ref() + .and_then(|s| SanitizedUrl::from_str_with_original(s)), + _ => None, + } +} + +/// Checks if the sender of this message is an admin. Returns `true` if this is a private chat +/// between the bot and the user. +pub async fn is_sender_admin(bot: &Bot, message: &Message) -> Result { + if message.chat.is_private() { + return Ok(true); + } + + if message.chat.id == CONTROL_CHAT_ID { + // Everyone in the control chat is an "admin". + return Ok(true); + } + + // check if a chat sent this, i.e. an anonymous admin. + // In such a case, "from()" returns @GroupAnonymousBot for backwards compatibility. + let is_admin = if let Some(sender_chat) = &message.sender_chat { + if sender_chat.id == message.chat.id { + // If it's posted by the chat itself, it's probably an anonymous admin. + true + } else { + // It may have been sent by the channel linked to this chat, then. + // Check for that. + let chat_full = bot.get_chat(message.chat.id).await?; + + chat_full.linked_chat_id() == Some(sender_chat.id.0) + } + } else if let Some(user) = &message.from { + let ChatMember { kind, .. } = bot.get_chat_member(message.chat.id, user.id).await?; + kind.is_privileged() + } else { + false + }; + + Ok(is_admin) +} + +/// Convenience function around [`is_sender_admin`] to use with a variable that holds the result if +/// it was already computed prior. 
+pub async fn is_sender_admin_with_cache( + bot: &Bot, + message: &Message, + cache: &mut Option, +) -> Result { + if let Some(cached) = cache { + return Ok(*cached); + } + + let result = is_sender_admin(bot, message).await?; + *cache = Some(result); + Ok(result) +} + +/// Iterate over all links that incriminate this message. This includes message entities, buttons, +/// and, if the message is a reply to a message in another chat, all of the above of that message +/// too. +pub fn iterate_over_all_links( + message: &Message, +) -> impl Iterator + Send + Sync + '_ { + macro_rules! the_unholy_links_iterator { + ($message: expr) => { + $message + .parse_entities() + .or_else(|| message.parse_caption_entities()) + .unwrap_or_default() + .into_iter() + .filter_map(|x| get_entity_url(&x)) + .chain( + message + .reply_markup() + .map(|x| &x.inline_keyboard) + .into_iter() + .flat_map(|x| x.iter()) + .flat_map(|x| x.iter()) + .filter_map(get_button_url), + ) + }; + } + + // This message *itself* might not have bad links, but it may be a reply across chats + // to a message that does, with a plea to click on the reply. Handle that too. + the_unholy_links_iterator!(message).chain( + message + .reply_to_message() + .filter(|reply_to| reply_to.chat.id != message.chat.id) + .into_iter() + .flat_map(|reply_to| the_unholy_links_iterator!(reply_to)), + ) +} + +/// Returns true if any of the links in this message are spam, or if it's a reply to a message in +/// another chat that does. 
+pub async fn does_message_have_spam_links(message: &Message, database: &Database) -> bool { + for (sanitized_url, _original_url) in iterate_over_all_links(message) { + if is_url_spam(database, &sanitized_url).await { + return true; + } + } + + false +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + + use super::*; + #[test] + fn parsing_url_like_telegram() { + let url = parse_url_like_telegram("https://example.com/").unwrap(); + assert_eq!(url.as_str(), "https://example.com/"); + let url = parse_url_like_telegram("example.com").unwrap(); + assert_eq!(url.as_str(), "https://example.com/"); + let url = parse_url_like_telegram("@amogus").unwrap(); + assert_eq!(url.as_str(), "https://t.me/amogus"); + } +} diff --git a/anti_nft_spam_bot/src/sanitized_url.rs b/anti_nft_spam_bot/src/sanitized_url.rs new file mode 100644 index 0000000..849f7d2 --- /dev/null +++ b/anti_nft_spam_bot/src/sanitized_url.rs @@ -0,0 +1,714 @@ +use std::{borrow::Cow, fmt::Display, str::FromStr}; + +use url::{Host, Url}; + +use crate::misc::parse_url_like_telegram; + +fn is_host_an_ip_address(url: &Url) -> bool { + matches!(url.host(), Some(Host::Ipv4(_) | Host::Ipv6(_))) +} + +/// Normalize percent-encoding and lowercase the ASCII parts of the text. +pub fn normalize(input: &str, output: &mut String) { + use percent_encoding::*; + + // All non-printable characters, but also + // all whitespace and separators for URL paths and query separators, and percent itself lol + const THIS_ASCII_SET: AsciiSet = CONTROLS + .add(b'%') + .add(b'&') + .add(b'=') + .add(b' ') + .add(b'+') + .add(b'/') + .add(b'\\'); + + if input.is_empty() { + return; + } + + // Percent decode. + let mut data: Cow<'_, [u8]> = percent_decode(input.as_bytes()).into(); + + // Replace all pluses with whitespace, if there's any. 
+ if let Some(first_plus) = data.iter().position(|x| *x == b'+') { + let mut data_owned = data.into_owned(); + let has_pluses = &mut data_owned[first_plus..]; + has_pluses + .iter_mut() + .map(|x| { + if *x == b'+' { + *x = b' '; + } + }) + .last(); + + data = data_owned.into(); + } + // Now percent encode. + let percent_normalized = percent_encode(&data, &THIS_ASCII_SET); + + // This happens *after* percent-encoding, so percent-encoded characters are not + // lowercased. Only ASCII characters can be lowercased here, so use ASCII lowercasing. + let lowercased = percent_normalized + .flat_map(|x| x.chars()) + .map(|c| c.to_ascii_lowercase()); + + lowercased.map(|c| output.push(c)).last(); +} + +/// Convenience wrapper around [`normalize`] that returns a new string with the result. +#[must_use] +pub fn normalize_new_string(input: &str) -> String { + let mut output = String::with_capacity(input.len()); + normalize(input, &mut output); + output +} + +/// A URL with various guarantees applied. See [`Self::new`] for details. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct SanitizedUrl(Url); + +impl SanitizedUrl { + /// Sanitizes an input URL in a destructive manner. In particular, these rules are applied: + /// + /// * URLs that have a weird scheme, have no host, or are incomplete, are rejected; [`None`] is + /// returned. + /// * Scheme is set to `https`. + /// * Fragment (like "#hello" at the end) is discarded. + /// * Username and password (like `name:pass@example.com`) are discarded. + /// * Port specification is discarded. + /// * Host (IP address or domain name) is [normalize]d. + /// * Each individual segment of the path is [normalize]d; empty ones are removed. This breaks + /// some URLs with case-sensitive websites. + /// * Trailing "/" at the end of the path is trimmed. 
+ /// * Modifications to the URL may be applied based on the domain name; for example, `youtu.be` + /// links are rewritten to `youtube.com` links and query (like "?a&b&c=d" at the end) is + /// discarded. + /// * Each individual query parameter's key and value (if any) is [normalize]d. This breaks some + /// URLs with case-sensitive parameters. + /// * Query parameters are alphabetically sorted and deduplicated. + #[must_use] + #[allow(clippy::missing_panics_doc)] // Cannot panic + pub fn new(mut url: Url) -> Option { + // For .expect(…) calls + static CAN_BE_A_BASE: &str = + "URL shouldn't be cannot-be-a-base due to check at start of function"; + + if url.scheme() == "file" || !url.has_host() || url.cannot_be_a_base() { + return None; + } + if url.scheme() != "https" { + // This discards a bunch of weird, likely invalid URLs while we're at it. + url.set_scheme("https").ok()?; + } + + // Normalize the host. + { + let host = url.host_str().expect("Check above ensures host is present"); + let normalized = normalize_new_string(host); + let normalized = normalized.trim_start_matches("www."); + url.set_host(Some(normalized)) + .expect("Normalizing host should not fail"); + } + + // Some URLs like Signal's use fragments for security. + // We'd like to wipe fragments here, which destroys those URLs too much, so preserve the + // fragment before doing so. + + if let Some(fragment) = url.fragment().filter(|f| !f.is_empty()) { + let host_str = url.host_str().expect("Host str should exist at this point"); + + match host_str { + "signal.me" | "signal.group" | "signal.link" | "signal.tube" | "signal.art" => { + let new_path = format!("{}/fragment__{}", url.path(), fragment); + url.set_path(&new_path); + } + _ => (), + } + } + + url.set_fragment(None); + url.set_username("").ok()?; + url.set_password(None).ok()?; + url.set_port(None).ok()?; + + // Normalize path via individual segments. 
+ // This is because "example.com/a/b" and "example.com/a%2Fb" are two different things even + // if they percent-decode to the same thing. + // + // This invalidates some URLs since some of them are case sensitive. + // This is fine, it is exceedingly unlikely that a URL is spam but another URL that has the + // exact same letters in it but with different casing isn't. + { + let mut normalized_path = String::new(); + for segment in url.path_segments().expect(CAN_BE_A_BASE) { + // Skip empty segments. + if segment.is_empty() { + continue; + } + normalized_path.push('/'); + normalize(segment, &mut normalized_path); + } + + url.set_path(&normalized_path); + } + + if !is_host_an_ip_address(&url) { + // Domain specific modifications to the URL + let host_str = url.host_str().expect("Host str should exist at this point"); + match host_str { + "t.me" | "telegram.me" | "telegram.dog" => { + if host_str != "t.me" { + url.set_host(Some("t.me")).expect("t.me is a valid host"); + } + + // Query params don't matter much for Telegram links. + url.set_query(None); + + // Strip all path stuff after the first path segment. + // So, for example, turn a link like "https://t.me/Architector_4/blah/blah" + // into "https://t.me/Architector_4" + let path_segments_count = url.path().chars().filter(|x| *x == '/').count(); + let mut segments = url.path_segments_mut().expect(CAN_BE_A_BASE); + for _ in 1..path_segments_count { + segments.pop(); + } + } + x if x.ends_with(".t.me") => { + // It's a link like https://architector4.t.me/ + // Translate to a normal username link. + + // url.set_*() function calls are arranged to reduce top allocated memory at any + // point. Probably doesn't matter, but eh. + + url.set_query(None); + + // I'm unsure if links like https://foo.bar.t.me/ might exist, + // so I'm assuming that everything before ".t.me" in the host is a username. 
+ let host_str = url.host_str().expect("Host str should exist at this point"); + let username = host_str.trim_end_matches(".t.me").to_string(); + + url.set_host(Some("t.me")).expect("t.me is a valid host"); + + url.set_path(&username); + } + "youtu.be" => { + // Example URL: https://youtu.be/dQw4w9WgXcQ?blahblah + // We want to convert this to a normal YouTube URL, disregarding the query part. + let query = format!("v={}", &url.path()[1..]); + url.set_host(Some("youtube.com")) + .expect("youtube.com is a valid host"); + url.set_path("watch"); + url.set_query(Some(&query)); + } + "youtube.com" | "m.youtube.com" => { + if host_str != "youtube.com" { + url.set_host(Some("youtube.com")) + .expect("youtube.com is a valid host"); + } + + if url.path() == "/watch" { + // A link to a video. Find the video query param, isolate it, remove all + // parameters, then add it. + // + // Video param may be not present. In that case, this code just clears the + // params entirely. That's fine. + let video_param = url + .query() + .and_then(|q| { + q.split('&').find(|param| { + // The param may be percent-encoded and/or uppercase. + // We need to check if it starts with "v=" or "%76=" + let mut chars = param.chars().map(|x| x.to_ascii_lowercase()); + + match chars.next() { + Some('v') => chars.next() == Some('='), + Some('%') => { + chars.next() == Some('%') + && chars.next() == Some('7') + && chars.next() == Some('6') + && chars.next() == Some('=') + } + _ => false, + } + }) + }) + .map(ToString::to_string); + + url.set_query(video_param.as_deref()); + } else if let Some((_, video_id)) = url.path().split_once("/shorts/") { + // A YouTube™ Shorts™ video. Unshortsing. 
+ let video_param = format!("v={video_id}"); + + url.set_path("watch"); + url.set_query(Some(&video_param)); + } + } + "fixupx.com" | "fxtwitter.com" | "girlcockx.com" | "mobile.twitter.com" + | "mobile.x.com" | "stupidpenisx.com" | "twitter.com" | "vxtwitter.com" + | "x.com" | "hitlerx.com" | "cunnyx.com" | "fixvx.com" => { + if host_str != "twitter.com" { + url.set_host(Some("twitter.com")) + .expect("twitter.com is a valid host"); + } + + // If this is a tweet, extract its ID and exclude the username handle. + let mut segments = url.path_segments().expect(CAN_BE_A_BASE); + + if segments.nth(1) == Some("status") { + if let Some(tweet_id) = segments.next() { + let new_path = format!("i/status/{tweet_id}"); + url.set_path(&new_path); + } + } + + // Query params never meaningfully matter on Twitter, as far as I can tell. + url.set_query(None); + } + _ => {} + } + } + + // Normalize query via individual parameters, if there's any. + // This kills some URLs too. Same caveat as above. + if let Some(query) = url.query() { + if query.is_empty() { + // If empty, just remove it. + url.set_query(None); + } else { + let mut params: Vec = Vec::new(); + let mut last_param: Option<&str> = None; + + for param in query.split('&') { + if last_param == Some(param) { + // Immediate duplicate. Skip. + continue; + } + last_param = Some(param); + + let (key, val) = param.split_once('=').unwrap_or((param, "")); + + let mut param_normalized = String::with_capacity(key.len() + 1 + val.len()); + + normalize(key, &mut param_normalized); + if !val.is_empty() { + param_normalized.push('='); + normalize(val, &mut param_normalized); + } + + params.push(param_normalized); + } + + // Sort by alphabet ascending, + params.sort_unstable(); + // Proper deduping after the sort. 
+ params.dedup(); + + let normalized_params = params.join("&"); + + if normalized_params.is_empty() { + url.set_query(None); + } else { + url.set_query(Some(&normalized_params)); + } + } + } + + Some(Self(url)) + } + + /// Returns the serialization of this URL. + #[must_use] + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Returns the host (a domain name or an IP address) in this URL. + #[allow(clippy::missing_panics_doc)] // Cannot panic + #[allow(unused)] + #[must_use] + pub fn host(&self) -> Host<&str> { + self.as_ref() + .host() + .expect("SanitizedUrl guarantees URL has a host") + } + + /// Returns whether or not the host in this URL is an IP address. + #[must_use] + #[allow(clippy::missing_panics_doc)] // Cannot panic + pub fn is_host_an_ip_address(&self) -> bool { + is_host_an_ip_address(self.as_ref()) + } + + /// Returns the host (a domain name or an IP address) in this URL as a string. + #[must_use] + #[allow(clippy::missing_panics_doc)] // Cannot panic + pub fn host_str(&self) -> &str { + self.as_ref() + .host_str() + .expect("SanitizedUrl guarantees URL has a host") + } + + /// Returns the `?query` part in this URL, if any. + /// + /// Either [`None`] or a non-empty string. + #[must_use] + pub fn query(&self) -> Option<&str> { + self.as_ref().query() + } + + /// Returns a path in this URL. Guaranteed to start with `/`. + #[must_use] + pub fn path(&self) -> &str { + self.as_ref().path() + } + + /// Returns both sanitized result and original. + #[must_use] + pub fn from_url_with_original(url: Url) -> Option<(Self, Url)> { + Some((Self::new(url.clone())?, url)) + } + + /// Parses the string to an [`Url`] and returns both that and the sanitized result. + #[must_use] + pub fn from_str_with_original(s: &str) -> Option<(Self, Url)> { + let url = parse_url_like_telegram(s).ok()?; + Some((Self::new(url.clone())?, url)) + } + + /// Removes all parts of the URL except the host and the protocol. 
+ pub fn remove_all_but_host(&mut self) { + self.0.set_fragment(None); + self.0.set_query(None); + self.0.set_path(""); + } + + /// Return an iterator that destructures the URL. + /// See [`SanitizedUrlDestructureIter`] for more details. + #[must_use] + pub fn destructure(&self) -> SanitizedUrlDestructureIter<'_> { + SanitizedUrlDestructureIter::new(self) + } + + /// How many times the URL should be destructured. 0 for none, 1 for same host and path but no query + /// (if there was no query in the first place, means the same thing as 0), 2 and onward as + /// iterations over [`SanitizedUrlDestructureIter`]. + /// + /// Returns the result of destructuring this URL this many times, or [`None`] if it was + /// destructured too much. + #[must_use] + pub fn destructure_to_number(&self, count: u64) -> Option { + if count == 0 { + Some(self.clone()) + } else { + let mut destructurer = self.destructure(); + let mut host = ""; + let mut path = ""; + for _ in 1..=count { + (host, path) = destructurer.next()?; + } + + let output = SanitizedUrl::from_str(&format!("https://{host}{path}")) + .expect("Host and path from SanitizedUrl are guaranteed to be valid"); + + Some(output) + } + } +} + +impl Display for SanitizedUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl AsRef for SanitizedUrl { + fn as_ref(&self) -> &Url { + &self.0 + } +} + +impl FromStr for SanitizedUrl { + type Err = (); + fn from_str(s: &str) -> Result { + parse_url_like_telegram(s) + .ok() + .and_then(SanitizedUrl::new) + .ok_or(()) + } +} + +/// An iterator that destructures a [`SanitizedUrl`] and returns an iterator of tuples of host and +/// path. Query part is ignored. 
+/// +/// For example, a link like would +/// destructure to: +/// +/// * Some(("a.b.c.example.com", "/some/funky/path")) +/// * Some(("a.b.c.example.com", "/some/funky")) +/// * Some(("a.b.c.example.com", "/some")) +/// * Some(("a.b.c.example.com", "/")) +/// * Some(("b.c.example.com", "/")) +/// * Some(("c.example.com", "/")) +/// * Some(("example.com", "/")) +/// * None +/// +/// (Can't put example code in rustdoc for private items unfortunately D:) +pub struct SanitizedUrlDestructureIter<'a> { + host: &'a str, + path: &'a str, + host_is_ip: bool, +} + +impl<'a> SanitizedUrlDestructureIter<'a> { + /// Create an instance of this iterator destructuring this URL. + #[must_use] + pub fn new(url: &'a SanitizedUrl) -> Self { + let host_is_ip = url.is_host_an_ip_address(); + Self::from_host_and_path(url.host_str(), url.path(), host_is_ip) + } + + /// Create the iterator manually given a host and a path, as well as whether or not the host is + /// an IP address or not. + /// + /// # Panics + /// + /// Panics if the input path does not start with a "/". + #[must_use] + pub fn from_host_and_path(host: &'a str, path: &'a str, host_is_ip: bool) -> Self { + assert!(path.starts_with('/'), "Path must always start with a /"); + Self { + host, + path, + host_is_ip, + } + } +} + +impl<'a> Iterator for SanitizedUrlDestructureIter<'a> { + type Item = (&'a str, &'a str); + fn next(&mut self) -> Option { + if self.path.len() > 1 { + let output = (self.host, self.path); + + (self.path, _) = self + .path + .rsplit_once('/') + .expect("Path is guaranteed to always start with /"); + + return Some(output); + } + + // Path is empty; reduce host. + + if self.host_is_ip { + // Host is an IP. Reduce and return that if we have it, otherwise bail. 
+ if self.host.is_empty() { + None + } else { + let output = (self.host, "/"); + self.host = ""; + Some(output) + } + } else if let Some((_subdomain_to_shed, rest_of_host)) = self.host.split_once('.') { + let output = (self.host, "/"); + self.host = rest_of_host; + Some(output) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + use super::*; + + /// Mostly just to note this for my own sanity lol + #[test] + fn url_crate_does_not_sanitize_percent_encoding() { + let url = Url::parse("http://%68ello/%68ello?%68ello=%68ello").unwrap(); + assert_ne!(url.as_str(), "http://hello/hello?hello=hello"); + } + #[test] + fn normalize_is_idempotent() { + // Not sure of the best way to test this, but here goes. + let initial = "%252525%25%2525%25%25%25%25252525"; + let mut result = normalize_new_string(initial); + assert_eq!(result, "%252525%25%2525%25%25%25%25252525"); + result = normalize_new_string(&result); + assert_eq!(result, "%252525%25%2525%25%25%25%25252525"); + } + + #[test] + fn general_test_idk() { + // Note: during query parameter parsing, + itself is considered to mean whitespace. 
+ let url: SanitizedUrl = "ftp://AMOGUS:AMOGUS@EXAMPLE.com:6969/lol/wat?1+%31=%32&AMONG#us" + .parse() + .unwrap(); + assert_eq!(url.as_str(), "https://example.com/lol/wat?1%201=2&among"); + + let url = Url::parse("https://example.com/woot/").unwrap(); + assert_eq!( + SanitizedUrl::new(url).unwrap().as_str(), + "https://example.com/woot" + ); + } + + #[test] + fn telegram_test() { + let expected_sanitized = "https://t.me/architector_4"; + + let url: SanitizedUrl = "t.me/Architector_4/?amogus#amogus".parse().unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = "http://Architector_4.t.me/".parse().unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = "telegram.dog".parse().unwrap(); + assert_eq!(url.as_str(), "https://t.me/"); + + let url: SanitizedUrl = "https://telegram.dog/Architector_4/amogus/amogus" + .parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = "https://foo.bar.amogus.t.me/".parse().unwrap(); + assert_eq!(url.as_str(), "https://t.me/foo.bar.amogus"); + } + + #[test] + fn youtube_test() { + let expected_sanitized = "https://youtube.com/watch?v=dqw4w9wgxcq"; + + let url: SanitizedUrl = "https://www.youtube.com/watch?t=22&v=dQw4w9WgXcQ" + .parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = "https://www.m.youtube.com/watch?v=dQw4w9WgXcQ&t=22" + .parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = "https://youtu.be/dQw4w9WgXcQ?t=69420".parse().unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + // This video isn't a Shorts, but the idea is the same. 
+ let url: SanitizedUrl = "https://www.youtube.com/shorts/dQw4w9WgXcQ" + .parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + } + + #[test] + fn twitter_test() { + let expected_sanitized = "https://twitter.com/i/status/1668313119301718016"; + let url: SanitizedUrl = + "https://www.x.com/rejectHisDesign/status/1668313119301718016?blahblah" + .parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = + "https://www.twitter.com/rejectHisDesign/status/1668313119301718016?blahblah" + .parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + + let url: SanitizedUrl = ( + "https://www.stupidpenisx.com/rejectHisDesign/status/1668313119301718016?tracking=shit" + ).parse() + .unwrap(); + assert_eq!(url.as_str(), expected_sanitized); + } + + #[test] + fn signal_test() { + let url: SanitizedUrl = "https://signal.group/#wasdWASD".parse().unwrap(); + + assert_eq!(url.as_str(), "https://signal.group/fragment__wasdwasd"); + } + + #[test] + fn destructure_test() { + let url: SanitizedUrl = "https://a.b.c.example.com/some/funky/path?ignored¶ms" + .parse() + .unwrap(); + let mut destructure = url.destructure(); + assert_eq!( + destructure.next(), + Some(("a.b.c.example.com", "/some/funky/path")) + ); + + assert_eq!( + destructure.next(), + Some(("a.b.c.example.com", "/some/funky")) + ); + + assert_eq!(destructure.next(), Some(("a.b.c.example.com", "/some"))); + + assert_eq!(destructure.next(), Some(("a.b.c.example.com", "/"))); + + assert_eq!(destructure.next(), Some(("b.c.example.com", "/"))); + + assert_eq!(destructure.next(), Some(("c.example.com", "/"))); + + assert_eq!(destructure.next(), Some(("example.com", "/"))); + + assert_eq!(destructure.next(), None); + + assert_eq!(destructure.next(), None); + } + + #[test] + fn destructure_test_with_nothing_to_do() { + let url: SanitizedUrl = "https://amogus.com/".parse().unwrap(); + let mut destructure = url.destructure(); + assert_eq!(destructure.next(), 
Some(("amogus.com", "/"))); + + assert_eq!(destructure.next(), None); + + assert_eq!(destructure.next(), None); + } + + #[test] + fn destructure_to_number() { + let url: SanitizedUrl = "https://a.b.c.example.com/some/funky/path?ignored¶ms" + .parse() + .unwrap(); + + assert_eq!(url.destructure_to_number(0).unwrap(), url); + assert_eq!( + url.destructure_to_number(1).unwrap().as_str(), + "https://a.b.c.example.com/some/funky/path" + ); + assert_eq!( + url.destructure_to_number(2).unwrap().as_str(), + "https://a.b.c.example.com/some/funky" + ); + assert_eq!( + url.destructure_to_number(3).unwrap().as_str(), + "https://a.b.c.example.com/some" + ); + assert_eq!( + url.destructure_to_number(4).unwrap().as_str(), + "https://a.b.c.example.com/" + ); + assert_eq!( + url.destructure_to_number(5).unwrap().as_str(), + "https://b.c.example.com/" + ); + assert_eq!( + url.destructure_to_number(6).unwrap().as_str(), + "https://c.example.com/" + ); + assert_eq!( + url.destructure_to_number(7).unwrap().as_str(), + "https://example.com/" + ); + assert_eq!(url.destructure_to_number(8), None); + } +} diff --git a/anti_nft_spam_bot/src/spam_checker/american_groundhog_spam.rs b/anti_nft_spam_bot/src/spam_checker/american_groundhog_spam.rs deleted file mode 100644 index b9c69a5..0000000 --- a/anti_nft_spam_bot/src/spam_checker/american_groundhog_spam.rs +++ /dev/null @@ -1,70 +0,0 @@ -/// Returns true if the provided HTML is from a spam Telegram invite link URL -/// spread by American Groundhog spammers, or false if it's not known. -/// -/// This function does not check if the passed HTML is actually from Telegram, -/// so don't use it for pages that aren't. -pub fn check_spam_telegram_html(html: &str) -> bool { - if html.contains("American groundhog 🇺🇸") { - // buh-bye! 
- return true; - } - - if html.contains("WikiLeaks") - && html.contains("We are here to bring you the truth") - { - return true; - } - - if html.contains("Memento") - && html.contains("Uncover hidden truths, decode mysteries") - { - return true; - } - - if html.contains("X Leaks") { - return true; - } - - // Can't see anything of note. - false -} - -#[cfg(test)] -mod tests { - // too lazy to fix these tests right now lmao - - //use super::super::{visit_and_check_if_spam, IsSpamCheckResult}; - //use super::*; - - //async fn check_url(bad_url: &'static str) { - // let bad_url = Url::parse(bad_url).unwrap(); - // assert_eq!( - // visit_and_check_if_spam(&bad_url).await.unwrap(), - // IsSpamCheckResult::YesUrl, - // "failed on {}", - // bad_url - // ); - //} - - // Telegram started blocking showing info on theHTTPS accessible - // description of the invite link, making this test fail. - // Oh well, it works based on the previous 100 times this test was run lmao - //#[tokio::test] - //async fn detect_american_groundhog() { - // check_url("https://telegra.ph/JOE-BIDEN-OFFICIALLY-SIGNS-THE-TIKTOK-BAN-BUT-YOU-DONT-KNOW-THE-REAL-REASON-FOR-IT-04-24").await; - //} - - //#[tokio::test] - //async fn detect_memento() { - // check_url("https://telegra.ph/Simpsons2024LIVE-04-18").await - // // Literally the same thing but with a different date: - // // https://telegra.ph/2-out-of-3-Simpsons-Predictions-in-BANNED-Episode-Come-True-Third-One-Targeting-Donald-Trump-Expected-for-April-30-04-29 - //} - - // Same issue as above. 
- //#[tokio::test] - //async fn detect_wikileaks() { - // check_url("https://telegra.ph/Sex-Trafficking-Ring-Organized-By-Famous-People-03-31").await; - // check_url("https://telegra.ph/No-Way-He-Did-That-05-28").await; - //} -} diff --git a/anti_nft_spam_bot/src/spam_checker/mod.rs b/anti_nft_spam_bot/src/spam_checker/mod.rs index 32dfdc2..f7ac1af 100644 --- a/anti_nft_spam_bot/src/spam_checker/mod.rs +++ b/anti_nft_spam_bot/src/spam_checker/mod.rs @@ -1,399 +1,15 @@ -use std::{collections::HashSet, sync::Arc, time::Duration}; -use url::Url; +use crate::{database::Database, sanitized_url::SanitizedUrl, types::UrlDesignation}; -use crate::{ - database::Database, - types::{Domain, IsSpam}, -}; - -/////// IMPORTANT!! -/////// IMPORTANT!! -/////// IMPORTANT!! -/////// If spam checking logic is updated to catch more spam, increment this. -pub const SPAM_CHECKER_VERSION: u32 = 5; - -// Checkers -mod american_groundhog_spam; -mod nft_spam; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum IsSpamCheckResult { - No, - YesUrl, - YesDomain, - Maybe, -} - -impl From for IsSpam { - fn from(val: IsSpamCheckResult) -> Self { - match val { - IsSpamCheckResult::No => IsSpam::No, - IsSpamCheckResult::YesUrl | IsSpamCheckResult::YesDomain => IsSpam::Yes, - IsSpamCheckResult::Maybe => IsSpam::Maybe, - } - } -} - -/// Check the link's domain against the database, or by visiting, as needed. +/// # Panics /// -/// Returns the check's result, and whether or not it's from a database. -/// -/// Returns [`None`] if both checking methods failed. -pub fn check<'a>( - database: &'a Arc, - domain: &'a Domain, - url: &'a Url, -) -> impl std::future::Future> + 'a { - check_inner(database, domain, url, 0) -} - -async fn check_inner( - database: &Arc, - domain: &Domain, - url: &Url, - recursion_depth: u8, -) -> Option<(IsSpam, bool)> { - // Check the database... 
- let db_result = database - .is_spam(url, Some(domain), false) - .await - .expect("Database died!"); - - log::debug!( - concat!( - "Checked {} with database (recursion {}) and got: {:?}\n", - "(second flag is true if manually reviewed)" - ), - url, - recursion_depth, - db_result - ); - - if recursion_depth > 1 { - log::debug!("Recursion level in checker reached..."); - return None; - } - - if let Some((result, true)) = db_result { - // Manually reviewed. Go ahead. - return Some((result, true)); - } - - // We now know it's not manually reviewed. Discard that flag. - let db_result = db_result.map(|x| x.0); - - if let Some(IsSpam::Yes) = db_result { - // Confirmed spam. Just return. - return Some((IsSpam::Yes, true)); - } - - if let Some(db_result) = db_result { - // It's marked as not spam or maybe spam. - // Was this manually reviewed? - - // Is this specifically for this URL, or just the general domain result? - if let Some(db_result_for_url) = database - .is_url_spam(url, false) - .await - .expect("Database died!") - { - log::debug!( - "Checked {url} URL specifically with database and got: {db_result_for_url:?}" - ); - return Some((db_result_for_url.0, true)); - } - - // No result for the URL specifically, but we are in this branch. - // This means `db_result` contains the result for the domain. - - // Assumption: if a domain is marked as not spam or maybe spam, - // and a URL is just the domain without a path, then the domain's - // result is accurate for that specific URL too. - - // URL crate's "empty path" seems to be just the slash, - // but also check for emptystring in case this isn't always true. - if url.path() == "/" || url.path().is_empty() { - return Some((db_result, true)); - } - } - - let mut url_maybe_spam = false; - - // All stuff above did not answer anything. Vibe check just the link... - - if let Some(url_looks_like_spam) = check_url_by_its_looks(url) { - // Add it to the database. 
- log::debug!("Checked if URL {url} looks like a spam URL and got: {url_looks_like_spam:?}"); - - match url_looks_like_spam { - IsSpam::Yes => { - database - .add_url(url, url_looks_like_spam, false) - .await - .expect("Database died!"); - return Some((url_looks_like_spam, false)); - } - // In case it's maybe spam or not spam, still check it properly. - IsSpam::Maybe => url_maybe_spam = true, - IsSpam::No => (), - } +/// Panics if the database dies lol +pub async fn is_url_spam(database: &Database, url: &SanitizedUrl) -> bool { + if let Some(info) = database.get_url(url).await.expect("Database died!") { + return info.designation() == UrlDesignation::Spam; } - log::debug!("{url} Is not in the database. Debouncing..."); - let mut visit_guard = None; - let has_visit_guard = if recursion_depth == 0 { - visit_guard = database.domain_visit_debounce(domain.clone()).await; - visit_guard.is_some() - } else { - true - }; - - if has_visit_guard { - match visit_and_check_if_spam(database, domain, url, recursion_depth).await { - Ok(mut is_spam_check) => { - // Add it to the database. - log::debug!("Visited {url} and got: {is_spam_check:?}"); - database - .add_url(url, is_spam_check.into(), false) - .await - .expect("Database died!"); - // All the other cases effectively apply to the domains too... - if is_spam_check != IsSpamCheckResult::YesUrl { - if is_spam_check == IsSpamCheckResult::No && url_maybe_spam { - is_spam_check = IsSpamCheckResult::Maybe; - } - - database - .add_domain(domain, url, is_spam_check.into(), false) - .await - .expect("Database died!"); - } - - Some((is_spam_check.into(), false)) - } - _ => { - // The visit probably timed out or something. Meh. - log::debug!("{url} timed out"); - None - } - } - } else { - log::debug!("{url} was just visited. Trying the database."); - // Oh no nevermind, someone else visited it. - // Just get the database result. 
- drop(visit_guard); - database - .is_spam(url, domain, false) - .await - .expect("Database died!") - .map(|x| (x.0, true)) - } -} - -fn get_reqwest_client(use_proxy: bool) -> Result { - use reqwest::*; - use std::fs::*; - use std::io::{BufRead, BufReader}; - use std::net::*; - - // Default policy is to follow up to 10 redirects. - // And yeah, I'm using a "real browser" user agent. Sorgy. At least I'm not spamming - // requests like a scraper or something. Need this to get around some CloudFlare - // captchas lol - let mut client = Client::builder() - .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36") - .timeout(Duration::from_secs(7)) - .connect_timeout(Duration::from_secs(7)) - // Force IPv4 because a proxy of mine doesn't support it lol - // https://github.com/seanmonstar/reqwest/issues/584 - .local_address(IpAddr::V4(Ipv4Addr::UNSPECIFIED)); - - if use_proxy { - if let Ok(proxies) = File::open("proxies.txt").map(|x| BufReader::new(x).lines()) { - for line in proxies { - match line { - Ok(line) => { - client = client.proxy(Proxy::all(line.trim())?); - } - Err(_) => break, - } - } - } - } - - client.build() -} - -/// Check if a website served by the given URL is spam or not by visiting it. -async fn visit_and_check_if_spam( - database: &Arc, - domain: &Domain, - url: &Url, - recursion_depth: u8, -) -> Result { - let mut client = get_reqwest_client(true)?; - - let result = match client.get(url.as_str()).send().await { - Ok(x) => x, - Err(e) => { - // Try without proxy? - client = get_reqwest_client(false)?; - let result = client.get(url.as_str()).send().await?; - // The ? in the line above is intentional. - // If we got here, that means connecting without proxy succeeded. - // Warn only in that case to not spam up logs due to generally invalid URLs. - log::warn!("Proxy failed, but normal request didn't:\n{e:?}"); - result - } - }; - - if result.url() != url { - // We have been redirected. 
Check where we ended up with the database. - if let Some(db_result) = database - .is_spam(result.url(), None, false) - .await - .expect("Database died!") - { - let response = match db_result.0 { - IsSpam::No => IsSpamCheckResult::No, - IsSpam::Yes => { - // The "Yes" answer may be for the domain of the new URL as a whole. - // However, this does not necessarily villify *this* whole domain. - IsSpamCheckResult::YesUrl - } - IsSpam::Maybe => IsSpamCheckResult::Maybe, - }; - - return Ok(response); - } - } - - // Gather some specifics relevant to cloudflare captchas... - let header_powered_by = result.headers().get("x-powered-by").is_some(); - let header_cf_ray = result.headers().get("cf-ray").is_some(); - let header_cache = result.headers().get("cf-cache-status").is_some(); - let status_code_forbidden = result.status() == reqwest::StatusCode::FORBIDDEN; - - let text = result.text().await?; - - if (text.contains("Just a moment...") - && text.contains("Enable JavaScript and cookies to continue")) - || text.contains("Attention Required! | Cloudflare") - || (text.contains("cloudflare") && text.contains("erify that you are a human")) - { - // Cloudflare captcha. - - // Check validity of it being a *real* cloudflare captcha. - if status_code_forbidden && !header_powered_by && !header_cache && header_cf_ray { - // It's a captcha. Bleh. If it's spam, users will let us know with /spam. - - log::debug!("Got CloudFlare captcha on URL {url}"); - return Ok(IsSpamCheckResult::No); - } - - // Fake cloudflare captcha. - // Can't believe we got lied to. So sad :( - - return Ok(IsSpamCheckResult::YesUrl); - } - - if domain.as_str().eq_ignore_ascii_case("telegra.ph") - || domain.as_str().eq_ignore_ascii_case("teletype.in") - { - // If it's telegra.ph, do some extra funny checks. - // Find links here and figure if they're spam themselves. 
- - let mut matches: HashSet = HashSet::with_capacity(20); - let mut html: &str = &text; - let mut current_consensus = IsSpamCheckResult::No; - - // Limit this to 20 matches - while matches.len() < 20 { - let Some(link_start) = html.find("http") else { - break; - }; - - let mut a_match = &html[link_start..]; - - let link_length = a_match.find('"').unwrap_or(a_match.len()); - - a_match = &a_match[..link_length]; - - // We found a potential link. Add it to our collection. - if let Ok(new_url) = Url::parse(a_match) { - if &new_url != url { - matches.insert(new_url); - } - } - // Advance html forward so we don't match on this same thing. - html = &html[link_start + link_length..]; - } - - log::debug!( - "RECURSING #{} on {} with {} links...", - recursion_depth, - url, - matches.len() - ); - - let mut iter = matches.iter().peekable(); - - // Now check each of those links. - while let Some(a_match) = iter.next() { - let Some(match_domain) = Domain::from_url(a_match) else { - continue; - }; - // We don't care if this is from DB or not here lol - if let Some((x, _)) = Box::pin(check_inner( - database, - &match_domain, - a_match, - recursion_depth + 1, - )) - .await - { - match x { - IsSpam::No => (), - IsSpam::Yes => return Ok(IsSpamCheckResult::YesUrl), - IsSpam::Maybe => current_consensus = IsSpamCheckResult::Maybe, - } - } - - // Sleep for a bit, so we don't hammer telegram in case there's multiple links. - if iter.peek().is_some() { - tokio::time::sleep(std::time::Duration::from_millis(300)).await; - } - } - - // Checked a telegra.ph link. Return results on that. - return Ok(current_consensus); - } - - // Check the HTML... - if nft_spam::is_spam_html(&text) { - return Ok(IsSpamCheckResult::YesDomain); - } - - if is_telegram_url(url) && american_groundhog_spam::check_spam_telegram_html(&text) { - return Ok(IsSpamCheckResult::YesUrl); - } - - // guess not. - Ok(IsSpamCheckResult::No) -} - -/// Check if this URL, just on its own, looks like spam. 
-pub fn check_url_by_its_looks(url: &Url) -> Option { - nft_spam::is_spam_telegram_url(url) -} - -/// Returns true if this URL's domain is Telegram. -pub fn is_telegram_url(url: &Url) -> bool { - let Some(domain) = url.domain() else { - return false; - }; + // No entry in the database found. + // TODO: automatic checking. - domain.eq_ignore_ascii_case("t.me") - || domain.eq_ignore_ascii_case("telegram.me") - || domain.eq_ignore_ascii_case("telegram.dog") + false } diff --git a/anti_nft_spam_bot/src/spam_checker/nft_spam.rs b/anti_nft_spam_bot/src/spam_checker/nft_spam.rs deleted file mode 100644 index 2f05999..0000000 --- a/anti_nft_spam_bot/src/spam_checker/nft_spam.rs +++ /dev/null @@ -1,168 +0,0 @@ -use url::Url; - -use crate::types::IsSpam; - -pub fn is_spam_html(text: &str) -> bool { - text.contains("cdnjs.cloudflare.com/ajax/libs/ethers") - || text.contains("ethereumjs") - || text.contains("web3.min.js") -} - -/// Returns `None` if it's not a telegram URL. -/// Returns `Some(IsSpam::No)` if it's not recognized as spam -/// by this function. -pub fn is_spam_telegram_url(url: &Url) -> Option { - if !super::is_telegram_url(url) { - return None; - } - - // Ripping out Url::path_segments() body here lol - let Some(path) = url.path().strip_prefix('/') else { - // Shouldn't happen but eh - return Some(IsSpam::No); - }; - - let path_lower = path.to_lowercase(); - let mut segments = path_lower.split('/'); - - let Some(username) = segments.next() else { - // Someone just linked t.me? lol - return Some(IsSpam::No); - }; - - if username == "blum" || username == "blumcryptobot" { - // Annoying crypto spam that's a telegram bot but - // also has a username without "bot" at the end. - return Some(IsSpam::Yes); - } - - if username == "notpixel" { - // Same as above. 
- return Some(IsSpam::Yes); - } - - if username == "models3dprint" { - // Not "NFT" spam per se, but those get spammed quite a lot in at least one chat and any - // query to them was only responded by automated bots. Might be a scam too. - return Some(IsSpam::Yes); - } - - if username.starts_with("tgh") { - // If more than 3 digits at the end... - // (if position of first non-digit character from the end - // is bigger than the third counting from 0...) - if username - .chars() - .rev() - .position(|x| !x.is_ascii_digit()) - .is_some_and(|x| x > 2) - { - // Some new spam has been flooding all over with usernames - // like "@TGHfocus25932" and such. - return Some(IsSpam::Yes); - } - } - - if !username.ends_with("bot") { - // Not a telegram bot (usually). - return Some(IsSpam::No); - } - - if username.ends_with("hamster_kombat_bot") { - // Specific one that's being spammed a bunch. - return Some(IsSpam::Yes); - } - - if username.ends_with("gemgombot") { - return Some(IsSpam::Yes); - } - - if username.ends_with("drft_party_bot") { - return Some(IsSpam::Yes); - } - - if username.ends_with("drop_bot") { - // No way in hell a "...drop_bot" is anything other than spam, right? - return Some(IsSpam::Yes); - } - - let Some(params) = segments.next() else { - // It's a bot, but no params. Probably fine. - return Some(IsSpam::No); - }; - - // It has parameters... That's somewhat sus. - - if params.contains("claim") || params.contains("drop") { - // Who else would post a bot with params of "claim" than spammers anyway? 
- return Some(IsSpam::Yes); - } - - let Some(query) = url.query() else { - // Checks below check for the query parameters specifically - return Some(IsSpam::Maybe); - }; - - if query.contains("startapp=kentId") { - // Weird specificity of a bunch of "nft game telegram bot" spam links - return Some(IsSpam::Yes); - } - - if params.contains("game") && query.contains("ref=") { - // Some spam "nft game telegram bot" links use this type of params instead - return Some(IsSpam::Yes); - } - - Some(IsSpam::Maybe) -} - -#[cfg(test)] -mod tests { - //#[test] - //fn wat(){ - // let text = include_str!("/media/ext_hdd/nobackup/architector4/Downloads/spam.txt"); - // assert!(is_spam_html(text)); - //} - - use url::Url; - - use super::is_spam_telegram_url; - use crate::types::IsSpam; - - #[test] - fn test_spam_bot_url() { - let random_url = Url::parse("https://www.amogus.com/").unwrap(); - assert!(is_spam_telegram_url(&random_url).is_none()); - - let random_telegram_url = Url::parse("https://t.me/Architector_4_Channel").unwrap(); - assert!(matches!( - is_spam_telegram_url(&random_telegram_url), - Some(IsSpam::No) - )); - - let random_telegram_bot_url = Url::parse("https://t.me/Anti_NFT_Spam_Bot").unwrap(); - assert!(matches!( - is_spam_telegram_url(&random_telegram_bot_url), - Some(IsSpam::No) - )); - - let spam_url = Url::parse("https://t.me/FawunBot/claim").unwrap(); - assert!(matches!(is_spam_telegram_url(&spam_url), Some(IsSpam::Yes))); - - let spam_url = - Url::parse("https://t.me/stonksdrop_bot?start=bd658555-7bc6-4652-8afb-e69fdd3d4c0d") - .unwrap(); - assert!(matches!(is_spam_telegram_url(&spam_url), Some(IsSpam::Yes))); - - let spam_url = - Url::parse("https://t.me/hAmster_kombat_bot/start?startapp=kentId677635570").unwrap(); - assert!(matches!(is_spam_telegram_url(&spam_url), Some(IsSpam::Yes))); - - let spam_url = Url::parse("http://t.me/trumpton_bot/game?ref=129383dHJJS").unwrap(); - assert!(matches!(is_spam_telegram_url(&spam_url), Some(IsSpam::Yes))); - - let 
spam_url = - Url::parse("https://t.me/notpixel/app?startapp=f6983374587_s573790").unwrap(); - assert!(matches!(is_spam_telegram_url(&spam_url), Some(IsSpam::Yes))); - } -} diff --git a/anti_nft_spam_bot/src/types.rs b/anti_nft_spam_bot/src/types.rs index a9b543b..67b3c99 100644 --- a/anti_nft_spam_bot/src/types.rs +++ b/anti_nft_spam_bot/src/types.rs @@ -1,306 +1,311 @@ use std::fmt::Display; -use sqlx::Error; -use teloxide::types::Message; +use teloxide::types::{InlineKeyboardButton, InlineKeyboardMarkup}; use url::Url; -use crate::{ - database::{self, Database}, - parse_url_like_telegram, -}; +use super::sanitized_url::SanitizedUrl; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum IsSpam { - No = 0, - Yes = 1, - Maybe = 2, -} - -impl IsSpam { - /// Picks the option that is most condemning, - /// along with a boolean that is true if `b` was picked, false otherwise. - pub fn pick_most_condemning(a: Option, b: Option) -> Option<(Self, bool)> { - match (a, b) { - (Some(Self::Yes), _) => Some((Self::Yes, false)), - (_, Some(Self::Yes)) => Some((Self::Yes, true)), - (Some(Self::Maybe), _) => Some((Self::Maybe, false)), - (_, Some(Self::Maybe)) => Some((Self::Maybe, true)), - (_, Some(Self::No)) => Some((Self::No, true)), - (Some(Self::No), _) => Some((Self::No, false)), - (None, _) => None, - } - } +/// A designation for a URL. Designates how to treat it. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum UrlDesignation { + /// This URL is not spam. Ignore it and anything under it in automatic checking. But, if anyone + /// indicates that a URL matching this rule is spam, and it wasn't manually reviewed to be not + /// spam, send it for review. + /// + /// So, if `example.com/abc` is designated as this, then `example.com/abc/def` is also not spam. + NotSpam = 0, + /// This URL is an aggregator. It is not spam, but some things under it may or may not be spam. 
+ /// + /// So, if `example.com/abc` is designated as this, then `example.com/abc/def` needs separate + /// checking to be determined if it's spam or not. + Aggregator = 1, + /// This URL is spam. Everything under it is spam too. + /// + /// So, if `example.com/abc` is designated as this, then `example.com/abc/def` is also spam. + Spam = 2, } -impl From for IsSpam { - fn from(value: u8) -> Self { - use IsSpam::*; +impl TryFrom for UrlDesignation { + type Error = (); + fn try_from(value: u8) -> Result { match value { - value if value == No as u8 => No, - value if value == Yes as u8 => Yes, - value if value == Maybe as u8 => Maybe, - _ => panic!("Unknown value: {value}"), + x if x == UrlDesignation::NotSpam as u8 => Ok(UrlDesignation::NotSpam), + x if x == UrlDesignation::Aggregator as u8 => Ok(UrlDesignation::Aggregator), + x if x == UrlDesignation::Spam as u8 => Ok(UrlDesignation::Spam), + _ => Err(()), } } } -impl From for u8 { - fn from(value: IsSpam) -> Self { - value as u8 - } -} - -/// A single domain name. -#[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub struct Domain(String); - -impl Domain { - pub fn from_url(url: &Url) -> Option { - url.domain().map(|x| Self(x.to_lowercase())) - } - /// Convenience function to try and parse a string directly to a domain name. - #[allow(unused)] - pub fn from_str(string: &str) -> Option { - parse_url_like_telegram(string) - .ok() - .as_ref() - .and_then(Self::from_url) - } - - pub fn as_str(&self) -> &str { - self.as_ref() - } +impl Display for UrlDesignation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let tha_string = match self { + Self::NotSpam => "Not spam", + Self::Aggregator => "Aggregator", + Self::Spam => "Spam", + }; - pub(crate) fn new_invalid_unchecked() -> Domain { - Domain(String::new()) + f.write_str(tha_string) } } -impl AsRef for Domain { - fn as_ref(&self) -> &str { - self.0.as_ref() - } +/// Review callback data. 
As per telegram API for callback queries, must be serialized to below 64 +/// bytes. +#[derive(Clone, Copy, Debug)] +pub struct ReviewCallbackData { + /// ID of the URL in the review queue. + pub review_entry_id: i64, + /// CRC32 hash of the URL in the review queue. + pub url_crc32: u32, + /// New designation in the review. + pub designation: UrlDesignation, + /// How many times the URL in this review queue should be destructured before applying this + /// review. Use as argument to [`SanitizedUrl::destructure_to_number`]. + pub destructure: u64, } -impl Display for Domain { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - Display::fmt(&self.0, f) +impl ReviewCallbackData { + // Manually written serialize/deserialize. + // Doing it with derived traits with Serde and JSON or whatever would be more "proper", + // but that typically involves allocation (can't beat a constant-size array), + // and the serialized result is always over 64 bytes, which isn't allowed. + // + // I could probably still squeeze it into JSON with a bunch of effort, but this is easier lol + + /// Serialize this data. Output is valid ASCII (i.e. valid UTF-8). 
+ #[must_use] + pub fn serialize(&self) -> [u8; 46] { + let mut output = [0u8; 46]; + output[0] = b'I'; + hex::encode_to_slice(self.review_entry_id.to_le_bytes(), &mut output[1..17]) + .expect("Slice size guaranteed to be valid"); + output[17] = b'C'; + hex::encode_to_slice(self.url_crc32.to_le_bytes(), &mut output[18..26]) + .expect("Slice size guaranteed to be valid"); + output[26] = b'D'; + hex::encode_to_slice((self.designation as u8).to_le_bytes(), &mut output[27..29]) + .expect("Slice size guaranteed to be valid"); + output[29] = b'S'; + hex::encode_to_slice(self.destructure.to_le_bytes(), &mut output[30..46]) + .expect("Slice size guaranteed to be valid"); + + output } -} -#[derive(Debug)] -pub enum ReviewResponse { - UrlSpam(Option, Url), - DomainSpam(Domain, Url), - NotSpam(Option, Url), - Skip, -} - -impl ReviewResponse { - /// True if this response marks something as spam. - #[allow(dead_code)] - pub fn marks_as_spam(&self) -> bool { - match self { - ReviewResponse::Skip => false, - ReviewResponse::UrlSpam(_, _) => true, - ReviewResponse::DomainSpam(_, _) => true, - ReviewResponse::NotSpam(_, _) => false, + /// Deserialize data. Should initially have been generated by [`Self::serialize`]. 
+ #[must_use] + pub fn deserialize(input: &[u8; 46]) -> Option { + if !(input[0] == b'I' && input[17] == b'C' && input[26] == b'D' && input[29] == b'S') { + return None; } - } - #[allow(dead_code)] - pub fn as_ref_url_domain(&self) -> Option<(Option<&Domain>, &Url)> { - match self { - ReviewResponse::Skip => None, - ReviewResponse::UrlSpam(d, u) => Some((d.as_ref(), u)), - ReviewResponse::DomainSpam(d, u) => Some((Some(d), u)), - ReviewResponse::NotSpam(d, u) => Some((d.as_ref(), u)), - } + let mut scratch = [0u8; 8]; + + hex::decode_to_slice(&input[1..17], &mut scratch[0..8]) + .expect("Slice size guaranteed to be valid"); + let review_entry_id = i64::from_le_bytes(scratch); + + hex::decode_to_slice(&input[18..26], &mut scratch[0..4]) + .expect("Slice size guaranteed to be valid"); + let url_crc32 = u32::from_le_bytes( + scratch[0..4] + .try_into() + .expect("Slice size guaranteed to be valid"), + ); + + hex::decode_to_slice(&input[27..29], &mut scratch[0..1]) + .expect("Slice size guaranteed to be valid"); + let designation: UrlDesignation = scratch[0].try_into().ok()?; + + hex::decode_to_slice(&input[30..46], &mut scratch[0..8]) + .expect("Slice size guaranteed to be valid"); + let destructure = u64::from_le_bytes(scratch); + + Some(Self { + review_entry_id, + url_crc32, + designation, + destructure, + }) } - pub fn deconstruct(self) -> Option<(Option, Url)> { - match self { - ReviewResponse::Skip => None, - ReviewResponse::UrlSpam(d, u) => Some((d, u)), - ReviewResponse::DomainSpam(d, u) => Some((Some(d), u)), - ReviewResponse::NotSpam(d, u) => Some((d, u)), - } + /// Convenience wrapper around [`Self::serialize`] that then converts the result into a string. 
+ #[must_use] + pub fn serialize_to_string(&self) -> String { + let serialized = self.serialize().to_vec(); + String::from_utf8(serialized) + .expect("Serialized review callback is guaranteed to be valid ASCII.") } - /// Returns true if ingesting this into the database - /// would cause a change that we are interested in. - pub async fn conflicts_with_db(&self, database: &Database) -> Result { - Ok(match self { - ReviewResponse::Skip => false, - ReviewResponse::UrlSpam(_, url) => database - .is_url_spam(url, false) - .await? - .is_none_or(|x| x.0 != IsSpam::Yes || !x.1), - ReviewResponse::DomainSpam(domain, _url) => database - .is_domain_spam(domain, false) - .await? - .is_none_or(|x| x.0 != IsSpam::Yes || !x.2), - ReviewResponse::NotSpam(domain, url) => database - .is_spam(url, domain.as_ref(), true) - .await? - .is_none_or(|x| x.0 != IsSpam::No || !x.1), - }) + /// Convenience wrapper around [`Self::deserialize`] that tries to deserialize from a string. + #[must_use] + pub fn deserialize_from_str(input: &str) -> Option { + let input: &[u8; 46] = input.as_bytes().try_into().ok()?; + Self::deserialize(input) } - /// Parse a string (callback query) into review responses, - /// with supplementary data. + /// Generates a keyboard with callback buttons for reviewing this URL with this review queue + /// entry ID. /// - /// All responses returned from one call of this function - /// are guaranteed to be of the same type. - pub async fn from_str( - value: &str, - database: &Database, - message: Option<&Message>, - ) -> Result, Box> { - let mut iter = value.split_ascii_whitespace(); - let action = iter.next().ok_or("Empty response")?; - - if action == "SKIP" { - if iter.next().is_some() { - Err("Extraneous data in response")?; - } - return Ok(vec![ReviewResponse::Skip]); - } - - let table = iter.next().ok_or("No table name")?; - - // May be a "figure it out yourself" review response. Figure that out. 
- if table == "derive" { - let Some(message) = message else { - Err("Message too old")? + /// If `destructure_down_to` is provided, buttons are only generated down to that. + #[must_use] + pub fn callback_buttons_from_url( + review_entry_id: i64, + url: &SanitizedUrl, + destructure_down_to: Option<&SanitizedUrl>, + ) -> InlineKeyboardMarkup { + fn format_as_spam_text(a: &str, trim_from_start: bool) -> String { + static LENGTH: usize = 25; + + let start = if trim_from_start { + a.char_indices().nth_back(LENGTH).unwrap_or((0, 'g')).0 + } else { + 0 }; - - let Some(text) = message.text() else { - Err("Message has no text???")? + let ellipsis_start = if start > 0 { "…" } else { "" }; + let end = if trim_from_start { + a.len() + } else { + a.char_indices().nth(LENGTH).unwrap_or((a.len(), 'g')).0 }; + let ellipsis_end = if end < a.len() { "…" } else { "" }; - let mut responses = Vec::new(); - for line in text.lines() { - if let Some(url_text) = line.strip_prefix("URL: ") { - let url: Url = url_text.parse().map_err(|_| "Failed to parse URL")?; - - let response = match action { - "URL_SPAM" => ReviewResponse::UrlSpam(None, url), - "DOMAIN_SPAM" => { - let domain = Domain::from_url(&url) - .ok_or("Failed to extract domain from a URL")?; - - ReviewResponse::DomainSpam(domain, url) - } - "NOT_SPAM" => ReviewResponse::NotSpam(None, url), - //"SKIP" => ReviewResponse::Skip, // Was handled above - _ => Err("Unknown action type")?, - }; - - responses.push(response); - } - } - - return Ok(responses); + format!( + "⛔️ {ellipsis_start}{}{ellipsis_end}", + &a[start..end].trim_matches('.') + ) } - let rowid: i64 = iter - .next() - .ok_or("No rowid")? - .parse() - .map_err(|_| "Failed to parse rowid")?; + let url_crc32 = crc32fast::hash(url.as_str().as_bytes()); - let crc32hash: u32 = iter - .next() - .ok_or("No hash")? 
- .parse() - .map_err(|_| "Failed to parse hash")?; - - if iter.next().is_some() { - Err("Extraneous data in response")?; - } - - let Some((url, domain_from_db)) = - database.get_url_from_table_and_rowid(table, rowid).await? - else { - Err("Specified data is not in database")? - }; - - if crc32fast::hash(url.as_str().as_bytes()) != crc32hash { - Err("Hash does not match! Please mark with a command instead and press Skip.")?; + let mut output: Vec> = + vec![vec![InlineKeyboardButton::callback( + "✅ Not spam".to_string(), + ReviewCallbackData { + review_entry_id, + url_crc32, + designation: UrlDesignation::NotSpam, + destructure: 0, + } + .serialize_to_string(), + )]]; + + if url.query().is_some() { + output.push(vec![InlineKeyboardButton::callback( + format!("⛔️ {url}"), + ReviewCallbackData { + review_entry_id, + url_crc32, + designation: UrlDesignation::Spam, + destructure: 0, + } + .serialize_to_string(), + )]); } - let domain = match domain_from_db { - Some(d) => Ok(d), - None => Domain::from_url(&url).ok_or("Failed extracting domain from URL"), - }; - - let response = match action { - "URL_SPAM" => ReviewResponse::UrlSpam(domain.ok(), url), - "DOMAIN_SPAM" => ReviewResponse::DomainSpam(domain?, url), - "NOT_SPAM" => ReviewResponse::NotSpam(domain.ok(), url), - //"SKIP" => ReviewResponse::Skip, // Was handled above - _ => Err("Unknown action type")?, - }; + for (i, (host, path)) in url.destructure().enumerate() { + if let Some(down_to) = destructure_down_to { + if host == down_to.host_str() && path == down_to.path() { + break; + } + } - Ok(vec![response]) - } + let the_str = if path.len() > 1 { path } else { host }; + let trim_from_start = path.len() > 1; - /// If this review response would mark a protected domain as spam, this method returns a - /// reference to it, otherwise returns `None`. - /// - /// This function takes a mutable reference because it might fill out the domain field of - /// [`ReviewResponse::UrlSpam`] variant, if this is one. 
- pub async fn conflicts_with_protected_domains( - &mut self, - db: &Database, - ) -> Result, Error> { - let domain_to_check = match self { - ReviewResponse::DomainSpam(domain, _) => Some(domain), - ReviewResponse::UrlSpam(domain, url) => { - if url.path().is_empty() || url.path() == "/" { - // We're marking just the plain link to the domain as spam. We want to ensure that - // it's not protected for this too. - - // First fetch it out, if needed. - if domain.is_none() { - let domain_new = Domain::from_url(url); - *domain = domain_new; - } - - domain.as_mut() - } else { - None + output.push(vec![InlineKeyboardButton::callback( + format_as_spam_text(the_str, trim_from_start), + ReviewCallbackData { + review_entry_id, + url_crc32, + designation: UrlDesignation::Spam, + destructure: i as u64 + 1, } + .serialize_to_string(), + )]); + + if output.len() >= 50 { + // Maximum limit is 100 buttons iirc, but eh. + break; } - ReviewResponse::NotSpam(..) | ReviewResponse::Skip => None, - }; + } - if let Some(domain_to_check) = domain_to_check { - db.is_domain_protected(domain_to_check) - .await - .map(|x| x.then_some(&*domain_to_check)) - } else { - Ok(None) + InlineKeyboardMarkup { + inline_keyboard: output, } } + + /// Given a review queue entry ID, and the URL in that entry, produces text and buttons for a + /// review keyboard. Buttons are produced with [`Self::callback_buttons_from_url`]. 
+ #[must_use] + pub fn produce_review_keyboard_text_buttons( + review_entry_id: i64, + sanitized_url: &SanitizedUrl, + original_url: &Url, + destructure_down_to: Option<&SanitizedUrl>, + ) -> (String, InlineKeyboardMarkup) { + let text = format!("REVIEW:\n\n{original_url}\n\nWhat is spam here?"); + let buttons = + Self::callback_buttons_from_url(review_entry_id, sanitized_url, destructure_down_to); + + (text, buttons) + } } -impl Display for ReviewResponse { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +/// Use like this: +/// +/// `format!("Deleted a message from someuser {}", delete_reason);` +#[derive(Clone, Copy, Debug)] +pub enum MessageDeleteReason { + /// Contains a spam link. + ContainsSpamLink, + /// It's in an album with a message that was deemed as spam. + OfAlbumWithSpamMessage, +} + +impl MessageDeleteReason { + /// Turn this into a string, or nothing if this reason shouldn't be printed at all. + /// + /// For a value like [`Self::ContainsSpamLink`] this returns "containing a spam link". 
+ /// + /// For best grammar, it's recommended to prepend the output string with "Removed a message from + /// someuser " + /// + /// + #[must_use] + pub fn to_str(self) -> Option<&'static str> { match self { - ReviewResponse::Skip => write!(f, "Skip"), - ReviewResponse::UrlSpam(_, url) => write!(f, "URL is spam: {url}"), - ReviewResponse::DomainSpam(_, url) => write!(f, "Domain and URL is spam: {url}"), - ReviewResponse::NotSpam(_, url) => write!(f, "Neither domain nor URL is spam: {url}"), + Self::ContainsSpamLink => Some("containing a spam link"), + Self::OfAlbumWithSpamMessage => None, } } } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum MarkSusResult { - Marked, - AlreadyMarkedSus, - AlreadyMarkedSpam, - ManuallyReviewedNotSpam, +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + use super::*; + + #[test] + fn callback_data_ser_de() { + #[allow(clippy::cast_possible_wrap)] + let data = ReviewCallbackData { + review_entry_id: 0xAAAAAAAAAAAAAAAAu64 as i64, + url_crc32: 0xBBBBBBBB, + designation: UrlDesignation::Aggregator, + destructure: 0xCCCCCCCCCCCCCCCC, + }; + + let serialized = data.serialize(); + + assert!(serialized.is_ascii()); + + let deserialized = ReviewCallbackData::deserialize(&serialized).unwrap(); + + assert_eq!(data.review_entry_id, deserialized.review_entry_id); + assert_eq!(data.url_crc32, deserialized.url_crc32); + assert_eq!(data.designation, deserialized.designation); + assert_eq!(data.destructure, deserialized.destructure); + } }