From a77eb52c564197b03a6b03a49fdddde3888cac39 Mon Sep 17 00:00:00 2001
From: Joshua Barretto
Date: Tue, 29 Apr 2025 21:10:12 +0100
Subject: [PATCH] Add robots.txt support

---
 README.md      |  11 +++--
 src/main.rs    | 112 +++++++++++++++++++++++++++----------------------
 src/robots.txt |   2 +
 3 files changed, 71 insertions(+), 54 deletions(-)
 create mode 100644 src/robots.txt

diff --git a/README.md b/README.md
index 4719319..8aee7cf 100644
--- a/README.md
+++ b/README.md
@@ -20,10 +20,15 @@ round --sock
 | Bind to the given socket. Defaults to 0.0.0.0:3000.
 ```
 
-Deploy it in a docker environment. It's probably safe, but no reason to take chances.
+Babble will search for a `robots.txt` file in the working directory to use. If it does not find one, it will use a
+default one that denies everything.
 
-If you want to be nice to crawlers that *actually abide by `robots.txt`*, perhaps add an entry to warn search engines
-away from it.
+Babble will periodically emit statistics into `stats.txt`, showing information about the worst-offending requesting
+IPs.
+
+## Warning
+
+Deploy it in a docker environment. It's probably safe, but no reason to take chances.
 
 ## Usage terms
 
diff --git a/src/main.rs b/src/main.rs
index fedce03..55ab919 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -56,6 +56,7 @@ fn create_rng(seed_bytes: impl IntoIterator<Item = u8>) -> Rng {
 
 const COUNT_FILE: &str = "count.txt";
 const STATS_FILE: &str = "stats.txt";
+const ROBOTS_TXT: &str = "robots.txt";
 
 const SLOW_CHUNK_SIZE: usize = 100;
 const SLOW_DURATION: Duration = Duration::from_millis(100);
@@ -119,62 +120,70 @@ async fn main() {
 
     let (stats_tx, stats_rx) = flume::unbounded();
 
+    let robots_txt = std::fs::read_to_string(ROBOTS_TXT)
+        .ok()
+        .unwrap_or_else(|| include_str!("robots.txt").to_string());
+
     let app = {
         let counter = counter.clone();
         let stats_tx = stats_tx.clone();
 
-        Router::new().route(
-            "/{id}",
-            get(
-                |Path(id): Path<String>,
-                 ConnectInfo(sock): ConnectInfo<SocketAddr>,
-                 headers: HeaderMap| async move {
-                    // Create a RNG for this path (deterministic, to simulate static pages)
-                    let mut rng = create_rng(id.bytes());
+        Router::new()
+            .route("/robots.txt", get(|| async move { robots_txt.clone() }))
+            .route(
+                "/{id}",
+                get(
+                    |Path(id): Path<String>,
+                     ConnectInfo(sock): ConnectInfo<SocketAddr>,
+                     headers: HeaderMap| async move {
+                        // Create a RNG for this path (deterministic, to simulate static pages)
+                        let mut rng = create_rng(id.bytes());
 
-                    let ip = headers
-                        .get("X-Forwarded-For")
-                        .and_then(|h| h.to_str().ok())
-                        .and_then(|h| h.split(',').next())
-                        .and_then(|s| s.trim().parse().ok())
-                        .unwrap_or_else(|| sock.ip());
-                    stats_tx.send(RequestStats { ip }).unwrap();
+                        let ip = headers
+                            .get("X-Forwarded-For")
+                            .and_then(|h| h.to_str().ok())
+                            .and_then(|h| h.split(',').next())
+                            .and_then(|s| s.trim().parse().ok())
+                            .unwrap_or_else(|| sock.ip());
+                        stats_tx.send(RequestStats { ip }).unwrap();
 
-                    // Count the request. Also doubles as the non-deterministic seed
-                    let count = counter.fetch_add(1, Ordering::Relaxed);
+                        // Count the request. Also doubles as the non-deterministic seed
+                        let count = counter.fetch_add(1, Ordering::Relaxed);
 
-                    // Create a RNG for this session (non-deterministic)
-                    let mut session_rng = create_rng(count.to_le_bytes());
+                        // Create a RNG for this session (non-deterministic)
+                        let mut session_rng = create_rng(count.to_le_bytes());
 
-                    // Artificially slow down connections as rudimentary DDoS protection, and to use up client resources
-                    tokio::time::sleep(Duration::from_millis(session_rng.random_range(200..1000)))
-                        .await;
+                        // Artificially slow down connections as rudimentary DDoS protection, and to use up client resources
+                        tokio::time::sleep(Duration::from_millis(
+                            session_rng.random_range(200..1000),
+                        ))
+                        .await;
 
-                    // Choose a bullshit generator from our collection for this page
-                    let generator = generators.choose(&mut rng).unwrap();
+                        // Choose a bullshit generator from our collection for this page
+                        let generator = generators.choose(&mut rng).unwrap();
 
-                    let title = generator
-                        .word_stream(rng.random_range(2..10), &mut rng.clone())
-                        .join(" ");
+                        let title = generator
+                            .word_stream(rng.random_range(2..10), &mut rng.clone())
+                            .join(" ");
 
-                    let stats = format!("Served rubbish to {count} clients so far");
+                        let stats = format!("Served rubbish to {count} clients so far");
 
-                    let content = generator
-                        .word_stream(rng.random_range(50..5_000), &mut rng.clone())
-                        .fold(String::new(), |mut content, word| {
-                            // Small chance of every word becoming a link back into the void
-                            if rng.random_bool(0.05) {
-                                let url = generator.word_stream(3, &mut rng.clone()).join("-");
-                                content += &format!(" <a href=\"/{}\">{}</a>", url, word);
-                            } else {
-                                // Also, a chance for every word to end with a newline. This should probably be controlled by the generator.
-                                content += if rng.random_bool(0.01) { ".<br>" } else { " " };
-                                content += &word
-                            }
-                            content
-                        });
+                        let content = generator
+                            .word_stream(rng.random_range(50..5_000), &mut rng.clone())
+                            .fold(String::new(), |mut content, word| {
+                                // Small chance of every word becoming a link back into the void
+                                if rng.random_bool(0.05) {
+                                    let url = generator.word_stream(3, &mut rng.clone()).join("-");
+                                    content += &format!(" <a href=\"/{}\">{}</a>", url, word);
+                                } else {
+                                    // Also, a chance for every word to end with a newline. This should probably be controlled by the generator.
+                                    content += if rng.random_bool(0.01) { ".<br>" } else { " " };
+                                    content += &word
+                                }
+                                content
+                            });
 
-                    let html = format!(
-                        "
+                        let html = format!(
+                            "
 
 {title}
@@ -187,15 +196,15 @@ async fn main() {
 "
-                    );
+                        );
 
-                    SlowBody {
-                        bytes: html.into(),
-                        interval: interval(SLOW_DURATION),
-                    }
-                },
-            ),
-        )
+                        SlowBody {
+                            bytes: html.into(),
+                            interval: interval(SLOW_DURATION),
+                        }
+                    },
+                ),
+            )
     };
 
     let mut interval = tokio::time::interval(Duration::from_secs(20));
@@ -221,6 +230,7 @@ async fn main() {
                 .rev()
                 .enumerate()
                 .map(|(i, (ip, n))| format!("{:<4} | {:<4} | {}\n", i + 1, n, ip))
+                .take(30)
                 .collect::<String>();
             let _ = std::fs::write(STATS_FILE, &stats);
         }
diff --git a/src/robots.txt b/src/robots.txt
new file mode 100644
index 0000000..eb05362
--- /dev/null
+++ b/src/robots.txt
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: