#!/bin/bash

set -u

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Access log to analyse, the file remembering how far it has been read,
# and the log that records every block decision and error.
LOGFILE="/var/log/apache2/access.log"
STATEFILE="/var/tmp/apache_log_monitor.offset"
BLOCKLOG="/var/log/apache2/blocked_ips.log"

# Score needed before block
THRESHOLD=10

# Burst threshold within one run (assuming every 5 minutes)
BURST_THRESHOLD=80

# NOTE on the patterns below: they are handed to awk as strings and matched
# case-insensitively. awk consumes backslash escapes when it parses a
# `-v name=value` assignment, so "\." would silently degrade to "." (any
# character). Literal dots are therefore written as the bracket expression
# "[.]", which survives unchanged in awk and in bash's [[ =~ ]].

# Whitelist your own IPs / trusted networks / known good bots if desired
WHITELIST_REGEX='^(127[.]0[.]0[.]1|::1)$'

# Known-good bot UAs to avoid blocking just because they contain "bot"
GOOD_BOT_UA_REGEX='(Googlebot|AdsBot-Google|GoogleOther|Bingbot|bingbot|Slurp|DuckDuckBot|Applebot|facebookexternalhit|Facebot|LinkedInBot)'

# Obvious bad tools / scripted clients
BAD_TOOL_UA_REGEX='(curl/|wget|python-requests|python-urllib|Go-http-client|libwww-perl|scrapy|httpclient|java/|okhttp|aiohttp|nikto|sqlmap|masscan|nmap|zgrab|curl$|python$)'

# Generic scraper/bot words
BAD_BOT_UA_REGEX='(bot|crawler|spider|scraper|harvester|scanner|checker|fetcher|parser|grabber|collector|monitor)'

# Suspicious path probes. Dots that belong to a file name are literal
# ("[.]"), so "/.git/config" matches but "/digit/" does not, and
# "/stripe/..." matches but "/css/striped.css" does not.
SUSPICIOUS_PATH_REGEX='([.]env([./_-]|$)|/env([./_-]|$)|[.]git(/|$)|wp-config|wp-admin|wp-json|/xmlrpc[.]php|/boaform|/cgi-bin/|/HNAP1|/[.]DS_Store|/docker-compose[.]yml|/docker-compose[.]yaml|/config([./]|$)|/credentials([./]|$)|/secrets?([./]|$)|stripe([./_]|$)|/debug[.]log|/phpinfo[.]php|/vendor/|/storage/|/backup/|/old/|/tests?/|/graphql|/swagger|/actuator|/manager/html|/server-status|/application[.]properties|/application[.]yml|/parameters[.]yml|/[.]vscode/|/[.]aws/|/id_rsa|/phpMyAdmin|/pma)'

# Scratch file holding the log lines added since the previous run.
# /var/tmp is world-writable, so the name must not be predictable: a fixed
# name would let a local user pre-plant a symlink for this script to
# overwrite. mktemp creates the file atomically with a random suffix, and
# the EXIT trap removes it on every exit path, including the error exits.
WORKFILE=$(mktemp /var/tmp/apache_log_monitor.newlines.XXXXXX) || {
    echo "$(date '+%F %T') ERROR: could not create work file in /var/tmp" >> "$BLOCKLOG"
    exit 1
}
trap 'rm -f -- "$WORKFILE"' EXIT

# ---------------------------------------------------------------------------
# Collect the part of the access log that was added since the previous run
# into $WORKFILE and remember the new read position in $STATEFILE.
# Exits 1 if the log cannot be found or measured, 0 if there is nothing new.
# ---------------------------------------------------------------------------

# Make sure the state directory and the block log exist before anything
# below tries to write to them.
mkdir -p "$(dirname "$STATEFILE")"
touch "$BLOCKLOG"

if [ ! -f "$LOGFILE" ]; then
    echo "$(date '+%F %T') ERROR: log file not found: $LOGFILE" >> "$BLOCKLOG"
    exit 1
fi

CURRENT_SIZE=$(stat -c%s "$LOGFILE" 2>/dev/null)
if [ -z "${CURRENT_SIZE:-}" ]; then
    echo "$(date '+%F %T') ERROR: could not stat $LOGFILE" >> "$BLOCKLOG"
    exit 1
fi

# Byte offset up to which the log has already been analysed. Anything that
# is not a plain decimal number (missing, empty or corrupt state) means
# "start from the beginning".
LAST_OFFSET=0
if [ -f "$STATEFILE" ]; then
    LAST_OFFSET=$(cat "$STATEFILE" 2>/dev/null)
    if [[ "$LAST_OFFSET" =~ ^[0-9]+$ ]]; then
        # Force base 10 so a value with leading zeros (e.g. "08") is not
        # rejected as invalid octal by the arithmetic below.
        LAST_OFFSET=$((10#$LAST_OFFSET))
    else
        LAST_OFFSET=0
    fi
fi

# Handle rotation/truncation
if [ "$CURRENT_SIZE" -lt "$LAST_OFFSET" ]; then
    LAST_OFFSET=0
fi

# Copy exactly the bytes between the old offset and the size measured above.
# Apache keeps appending while this runs; without the head -c cap, tail
# would also return bytes written after the stat, and because only
# CURRENT_SIZE used to be stored those bytes were scored a second time on
# the next run.
NEW_BYTES=$((CURRENT_SIZE - LAST_OFFSET))
tail -c +"$((LAST_OFFSET + 1))" "$LOGFILE" 2>/dev/null | head -c "$NEW_BYTES" > "$WORKFILE"

# Advance the offset by what was really extracted. If tail failed or the
# log was rotated in between, fewer (or zero) bytes arrive and the missing
# part is retried next time instead of being skipped silently.
EXTRACTED=$(stat -c%s "$WORKFILE" 2>/dev/null)
[[ "${EXTRACTED:-}" =~ ^[0-9]+$ ]] || EXTRACTED=0
echo "$((LAST_OFFSET + EXTRACTED))" > "$STATEFILE"

# Nothing new since the last run.
if [ ! -s "$WORKFILE" ]; then
    rm -f "$WORKFILE"
    exit 0
fi

# Prefer ipset: one kernel set with a 24h per-entry timeout plus a single
# iptables rule, instead of one permanent iptables rule per address.
#
# USE_IPSET is switched on only when both the set and the DROP rule that
# references it are known to be in place. If either step fails (missing
# kernel module, insufficient privileges, ...) the script falls back to
# per-IP iptables rules rather than adding entries to a set nothing enforces.
USE_IPSET=0
if command -v ipset >/dev/null 2>&1; then
    if { ipset list apacheblock >/dev/null 2>&1 ||
         ipset create apacheblock hash:ip timeout 86400; } &&
       { iptables -C INPUT -m set --match-set apacheblock src -j DROP >/dev/null 2>&1 ||
         iptables -I INPUT 1 -m set --match-set apacheblock src -j DROP; }; then
        USE_IPSET=1
    else
        echo "$(date '+%F %T') WARN: ipset setup failed, falling back to per-IP iptables rules" >> "$BLOCKLOG"
    fi
fi

# ---------------------------------------------------------------------------
# Score the new log lines per client address and block the offenders.
#
# Stage 1 (awk) reads $WORKFILE and prints one tab-separated record per
# address whose score reaches THRESHOLD:
#     ip <TAB> score <TAB> requests <TAB> unique paths <TAB> last log line
# Stage 2 (while loop) blocks each address that is not blocked already and
# appends the outcome to $BLOCKLOG.
#
# Notes on the awk stage:
#   * It is written in plain POSIX awk. match() with an array argument,
#     IGNORECASE and length(array) exist only in gawk, so under mawk or
#     busybox awk the program would not even parse and nothing would ever
#     be blocked.
#   * The patterns are passed through the environment, not with -v: awk
#     strips backslashes from -v values, which would turn "\." into ".".
#   * Matching is case-insensitive by lowercasing both pattern and subject,
#     so the configured patterns must not rely on case-sensitive escapes.
#   * LC_ALL=C makes toupper()/tolower() and substr() strictly byte-wise,
#     so offsets found in the upper-cased copy are valid in the original.
# ---------------------------------------------------------------------------
LC_ALL=C \
APACHE_MON_WHITELIST="$WHITELIST_REGEX" \
APACHE_MON_GOODBOT="$GOOD_BOT_UA_REGEX" \
APACHE_MON_BADTOOL="$BAD_TOOL_UA_REGEX" \
APACHE_MON_BADBOT="$BAD_BOT_UA_REGEX" \
APACHE_MON_SUSPICIOUS="$SUSPICIOUS_PATH_REGEX" \
awk \
  -v threshold="$THRESHOLD" \
  -v burst_threshold="$BURST_THRESHOLD" '
BEGIN {
    whitelist  = tolower(ENVIRON["APACHE_MON_WHITELIST"])
    goodbot    = tolower(ENVIRON["APACHE_MON_GOODBOT"])
    badtool    = tolower(ENVIRON["APACHE_MON_BADTOOL"])
    badbot     = tolower(ENVIRON["APACHE_MON_BADBOT"])
    suspicious = tolower(ENVIRON["APACHE_MON_SUSPICIOUS"])
}
{
    ip = $1

    # Only literal addresses are scored. With HostnameLookups enabled the
    # first field is a host name chosen by whoever controls the reverse DNS
    # record; iptables would resolve it and block unrelated addresses.
    is_v4 = (ip ~ /^[0-9]+[.][0-9]+[.][0-9]+[.][0-9]+$/)
    is_v6 = (index(ip, ":") > 0 && ip ~ /^[0-9A-Fa-f:.]+$/)
    if (!is_v4 && !is_v6) next

    if (tolower(ip) ~ whitelist) next

    line  = $0
    upper = toupper(line)
    score = 0

    # Extract request path: the token after the method in the request line.
    path = ""
    if (match(upper, /"(GET|POST|HEAD|OPTIONS|PUT|DELETE) [^ ]+/)) {
        req  = substr(line, RSTART, RLENGTH)
        path = substr(req, index(req, " ") + 1)
    }
    lpath = tolower(path)

    # Extract status: the three digits following the closing request quote.
    status = ""
    if (match(line, /" [0-9][0-9][0-9] /)) {
        status = substr(line, RSTART + 2, 3)
    }

    # Extract UA (last quoted field in combined log)
    n = split(line, q, "\"")
    ua = ""
    if (n >= 6) ua = q[n-1]
    lua = tolower(ua)

    reqs[ip]++

    # Count distinct paths per address as they appear, so the END block
    # does not have to rescan every (ip, path) pair for every address.
    if (path != "") {
        key = ip SUBSEP path
        if (!(key in uniq)) {
            uniq[key] = 1
            uniq_count[ip]++
        }
    }

    # Suspicious target paths
    if (lpath ~ suspicious) score += 5

    # Empty or missing UA
    if (ua == "" || ua == "-") score += 2

    # Attack tools
    if (lua ~ badtool) score += 5

    # Generic bot/scraper wording, unless known good bot
    if (lua ~ badbot && lua !~ goodbot) score += 3

    # HEAD/OPTIONS floods are often probing
    if (upper ~ /"(HEAD|OPTIONS) /) score += 1

    # POST to suspicious locations
    if (upper ~ /"POST / && lpath ~ suspicious) score += 4

    # Repeated failures add weight
    if (status ~ /^(400|401|403|404|405|444)$/) score += 1

    # Redirects to hostile targets still count
    if (status ~ /^(301|302)$/ && lpath ~ suspicious) score += 1

    scores[ip] += score
    lastline[ip] = line
}
END {
    for (ip in reqs) {
        unique_count = uniq_count[ip] + 0
        total = scores[ip]

        # Burst traffic in one 5-min window
        if (reqs[ip] >= burst_threshold) total += 8

        # Many distinct URLs in one short window is scanner-like
        if (unique_count >= 25) total += 6

        if (total >= threshold) {
            print ip "\t" total "\t" reqs[ip] "\t" unique_count "\t" lastline[ip]
        }
    }
}
' "$WORKFILE" | while IFS=$'\t' read -r ip score reqs unique_count sample; do
    [ -z "$ip" ] && continue

    # Pick the mechanism by address family, skip addresses that are already
    # blocked by that same mechanism, then try to add the block. Checking
    # with the matching tool matters: the ipset is IPv4-only, so testing an
    # IPv6 address against it never succeeds and the address would be
    # reported as newly blocked on every run.
    if [[ "$ip" == *:* ]]; then
        # IPv6
        if ! command -v ip6tables >/dev/null 2>&1; then
            echo "$(date '+%F %T') WARN: ip6tables not available for $ip" >> "$BLOCKLOG"
            continue
        fi
        ip6tables -C INPUT -s "$ip" -j DROP >/dev/null 2>&1 && continue
        METHOD="ip6tables"
        ip6tables -I INPUT 1 -s "$ip" -j DROP
        rc=$?
    elif [ "$USE_IPSET" -eq 1 ]; then
        # IPv4 through the timed ipset
        ipset test apacheblock "$ip" >/dev/null 2>&1 && continue
        METHOD="ipset"
        ipset add apacheblock "$ip" timeout 86400
        rc=$?
    else
        # IPv4 through a dedicated iptables rule
        iptables -C INPUT -s "$ip" -j DROP >/dev/null 2>&1 && continue
        METHOD="iptables"
        iptables -I INPUT 1 -s "$ip" -j DROP
        rc=$?
    fi

    # Only claim a block that actually happened.
    if [ "$rc" -ne 0 ]; then
        echo "$(date '+%F %T') ERROR: failed to block $ip via $METHOD (exit $rc)" >> "$BLOCKLOG"
        continue
    fi

    echo "$(date '+%F %T') BLOCKED $ip via $METHOD score=$score reqs=$reqs unique_paths=$unique_count sample=$sample" >> "$BLOCKLOG"
done

rm -f "$WORKFILE"
exit 0
