GNU Parallel을 사용하여 대규모 블록 목록 처리

GNU Parallel을 사용하여 대규모 블록 목록 처리

이는 진행 중인 프로젝트이며 목록 처리 속도를 높이기 위해 중요한 업데이트가 이루어졌습니다. 관심 있는 분들을 위한 홈페이지는 여기에 있습니다!

이 프로젝트는 lists.json에 정의된 목록을 처리합니다. 목록 내용을 원시 호스트로 변환한 뒤, 각 호스트를 해당 처리 방식(method) 및 호스트 형식과 일치하는 목록에 배치합니다. build_lists.bash 상단의 상수는 각 변수가 무엇으로 정의되는지 보여줍니다.

속도를 높이거나 버그를 수정하는 방법에 대한 제안을 환영합니다! 또한 프로젝트는 이 도커 환경에서 실행하는 것이 좋습니다. 여기에 제공된 모든 파일을 동일한 디렉토리에 배치하고, 모든 스크립트에 실행 권한을 부여한 후 ./build_lists.bash를 실행하십시오. aria2.conf의 useragent를 변경하는 것도 권장됩니다.

입력:

스크립트:

build_lists.bash

#!/usr/bin/env bash

# Shell configuration for the whole run: strict error handling, predictable
# (non-interactive-like) behavior, and owner-only permissions on output.

#shopt -s extdebug     # or --debugging
set +H +o history     # disable history features (helps avoid errors from "!" in strings)
shopt -u cmdhist      # would be enabled and have no effect otherwise
shopt -s execfail     # ensure interactive and non-interactive runtime are similar
shopt -s extglob      # enable extended pattern matching (https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html)
set -euET -o pipefail # put bash into strict mode & have it give descriptive errors
# A umask clears the bits it names, so 077 (not 055) is what removes every
# group/other permission: new directories become 700 and new files 600.
# 055 would have left them group- and world-writable (722 / 622).
umask 077

# Scratch space: a private temporary directory for downloads, plus one
# reusable temporary file created inside it.
DOWNLOADS=$(mktemp -d)
TMP=$(mktemp -p "$DOWNLOADS")
readonly DOWNLOADS TMP

# How a list is applied.
readonly METHOD_ALLOW='ALLOW'
readonly METHOD_BLOCK='BLOCK'

# What kind of host entries a list holds.
readonly FORMAT_DOMAIN='DOMAIN'
readonly FORMAT_CIDR4='CIDR4'
readonly FORMAT_CIDR6='CIDR6'
readonly FORMAT_IPV4='IPV4'
readonly FORMAT_IPV6='IPV6'

# Iteration order used by main: BLOCK before ALLOW, and the plain IP
# formats before their CIDR counterparts.
readonly -a METHODS=("$METHOD_BLOCK" "$METHOD_ALLOW")
readonly -a FORMATS=("$FORMAT_DOMAIN" "$FORMAT_IPV4" "$FORMAT_IPV6" "$FORMAT_CIDR4" "$FORMAT_CIDR6")

# https://github.com/ildar-shaimordanov/perl-utils#sponge
sponge() {
    perl -ne '
    push @lines, $_;
    END {
        open(OUT, ">$file")
        or die "sponge: cannot open $file: $!\n";
        print OUT @lines;
        close(OUT);
    }
    ' -s -- -file="$1"
}

#######################################
# Sort a list file in place with parsort (-u keeps one copy of each line)
# and report it.
# Globals:   DOWNLOADS (read) - directory handed to parsort via -T
# Arguments: $1 - list file to rewrite
# Outputs:   an "[INFO] Optimized" line on stdout
#######################################
sorted() {
    local -r target="$1"

    # sponge buffers the whole result before writing, which is what makes
    # reading and writing the same path in one pipeline safe.
    parsort -bfiu -S 100% -T "$DOWNLOADS" "$target" | sponge "$target"
    printf '%s\n' "[INFO] Optimized: ${target}"
}

# params: blacklist, whitelist
#######################################
# Remove from the blacklist every line that appears verbatim in the
# whitelist (grep -F -x -v), rewriting the blacklist in place.
# Globals:   TMP (read) - scratch file; left empty again afterwards
# Outputs:   an "[INFO] Applied whitelist" line on stdout
#######################################
apply_whitelist() {
    local -r blacklist="$1"
    local -r whitelist="$2"

    # https://askubuntu.com/a/562352
    # send each line into the temp file as it's processed instead of keeping it in memory
    # NOTE(review): grep -v exits 1 when a chunk has no surviving lines;
    # confirm how parallel's exit status then interacts with `set -e`.
    parallel --pipe -k -j+0 grep --line-buffered -Fxvf "$whitelist" - <"$blacklist" >>"$TMP"
    cp "$TMP" "$blacklist"
    : >"$TMP"
    printf '%s\n' "[INFO] Applied whitelist to: ${blacklist}"
}

# params: ip list, cidr whitelist
#######################################
# Drop from the IP list every entry matched by the CIDR list
# (grepcidr -v), rewriting the IP list in place.
# Does nothing, successfully, when the IP list is not a regular file.
# Outputs:   an "[INFO] Applied CIDR whitelist" line on stdout
#######################################
apply_cidr_whitelist() {
    local -r ip_list="$1"
    local -r cidr_list="$2"

    # Nothing to filter if this list was never built.
    [[ -f "$ip_list" ]] || return 0

    sem -j+0 grepcidr -vf "$cidr_list" <"$ip_list" | sponge "$ip_list"
    sem --wait
    printf '%s\n' "[INFO] Applied CIDR whitelist to: ${ip_list}"
}

#######################################
# Prepare the working environment for a run.
# Globals:   DOWNLOADS (read) - scratch directory removed on exit
# Side effects:
#   - installs an EXIT trap
#   - creates build/ and logs/ and truncates logs/aria2.log
#   - clears the sticky bit on /tmp
#######################################
init() {
    # The trap runs on every exit path, including `set -e` aborts, so the
    # scratch directory is removed and /tmp gets its sticky bit back even
    # when the run fails part-way.
    trap 'rm -rf -- "$DOWNLOADS"; chmod +t /tmp' EXIT || exit 1
    # logs/ must exist before the log file can be truncated below.
    mkdir -p build/ logs/
    : >logs/aria2.log
    # NOTE(review): why the sticky bit has to be cleared is not visible in
    # this script — confirm it is still required.
    chmod -t /tmp
}

# Set the sticky bit on /tmp again (init clears it at start-up).
cleanup() {
    local -r tmp_root='/tmp'

    chmod +t "$tmp_root"
}

#######################################
# Build every list under build/.
# For each method (in METHODS order) the lists selected from lists.json are
# downloaded with aria2c; then, for each format, the downloads are run
# through ./apply_filters.bash in parallel, the collected output is written
# to build/<METHOD>_<FORMAT>.txt and sorted, and whitelists are applied.
# Finally empty lists are deleted and build/CHECKSUMS.txt is written.
# Globals:   METHODS, FORMATS, DOWNLOADS, METHOD_ALLOW, FORMAT_CIDR4,
#            FORMAT_CIDR6 (all read)
# Outputs:   "[INFO] ..." progress lines on stdout
#######################################
main() {
    local cache
    local list
    local blacklist
    local results
    # Loop variables are local too so they do not leak into the global scope.
    local method
    local format

    init

    for method in "${METHODS[@]}"; do
        cache="${DOWNLOADS}/${method}"

        echo "[INFO] Processing method: ${method}"

        set +e # temporarily disable strict fail, in case downloads fail
        # Emits aria2c input-file entries: a tab-separated mirror line
        # followed by an " out=<key>" option line for each selected list.
        jq -r --arg method "$method" 'to_entries[] |
            select(.value.content.retriever == "ARIA2" and .value.method == $method) |
            {key, mirrors: .value.mirrors} |
            (.mirrors | join("\t")), " out=\(.key)"' lists.json |
            aria2c -i- -d "$cache" --conf-path='./aria2.conf'
        set -e

        echo "[INFO] Downloaded ${method} lists!"

        for format in "${FORMATS[@]}"; do
            results="${cache}/${format}"
            mkdir -p "$results"

            echo "[INFO] Sending list results to: ${results}"

            # -maxdepth 1 keeps the per-format result directories (which
            # live inside $cache) out of the set of files to process.
            find -P -O3 "$cache" -maxdepth 1 -type f -print0 |
                # https://www.gnu.org/software/parallel/parallel_tutorial.html#controlling-the-execution
                parallel -0 --use-cpus-instead-of-cores --jobs 0 --results "$results" -X ./apply_filters.bash {} "$method" "$format"

            list="build/${method}_${format}.txt"

            echo "[INFO] Processed: ${list}"

            find -P -O3 "$results" -type f -name stdout -exec cat -s {} + | sponge "$list"

            # -s is true only for an existing, non-empty file.
            if [[ -s "$list" ]]; then
                sorted "$list"

                if [[ "$method" == "$METHOD_ALLOW" ]]; then
                    blacklist="build/BLOCK_${format}.txt"
                    echo "[INFO] Applying whitelist: ${list}"

                    case "$format" in
                    "$FORMAT_CIDR4")
                        apply_cidr_whitelist "$blacklist" "$list"
                        apply_cidr_whitelist "build/BLOCK_IPV4.txt" "$list"
                        ;;
                    "$FORMAT_CIDR6")
                        apply_cidr_whitelist "$blacklist" "$list"
                        apply_cidr_whitelist "build/BLOCK_IPV6.txt" "$list"
                        ;;
                    *)
                        apply_whitelist "$blacklist" "$list"
                        ;;
                    esac
                else
                    # Remove IPs from the IP blacklists that are covered by the CIDR blacklists
                    case "$format" in
                    "$FORMAT_CIDR4")
                        apply_cidr_whitelist "build/BLOCK_IPV4.txt" "$list"
                        ;;
                    "$FORMAT_CIDR6")
                        apply_cidr_whitelist "build/BLOCK_IPV6.txt" "$list"
                        ;;
                    *) ;;
                    esac
                fi

                echo "[INFO] Processed ${method} ${format} list!"
            fi
        done
    done

    # https://superuser.com/questions/191889/how-can-i-list-only-non-empty-files-using-ls
    find -P -O3 ./build/ -size 0 -type f -name "*.txt" -exec rm {} \; # remove any empty lists
    # Leave CHECKSUMS.txt out of its own input: a copy left by an earlier
    # run would otherwise be hashed into the new checksum file.
    find -P -O3 ./build/ -type f -name "*.txt" ! -name 'CHECKSUMS.txt' -exec sha256sum {} \; | sponge './build/CHECKSUMS.txt'

    cleanup
}

# Entry point. The call sits at the very end of the script so that every
# function it uses has already been defined by the time it runs.
# https://github.com/koalaman/shellcheck/wiki/SC2218
main

apply_filters.bash

#!/usr/bin/env bash

# Filter stdin through `ipinfo grepip` restricted to IPv4 (-4).
# NOTE(review): the remaining short flags (-h -o -x) are passed as-is;
# see the ipinfo CLI help for their exact meaning.
get_ipv4s() {
    local -ra grepip_opts=(-4hox --nocolor)

    ipinfo grepip "${grepip_opts[@]}"
}

# Filter stdin through `ipinfo grepip` restricted to IPv6 (-6).
# NOTE(review): the remaining short flags (-h -o -x) are passed as-is;
# see the ipinfo CLI help for their exact meaning.
get_ipv6s() {
    local -ra grepip_opts=(-6hox --nocolor)

    ipinfo grepip "${grepip_opts[@]}"
}

# Read text on stdin, find every http/https/udp URL in it and print the
# captured host part ($3) when it validates as a domain (.onion is
# accepted as a private TLD). Anything perl writes to stderr is discarded.
get_domains_from_urls() {
    local -r extract_domains='while (/$RE{URI}{HTTP}{-scheme => "https?|udp"}{-keep}/g) {say $3 if is_domain($3, { domain_private_tld => { onion => 1 } })}'

    perl -MData::Validate::Domain=is_domain -MRegexp::Common=URI -nE "$extract_domains" 2>/dev/null
}

# Read text on stdin, find every http/https/udp URL in it and print the
# captured host part ($3) when it validates as an IPv4 address.
# Anything perl writes to stderr is discarded.
get_ipv4s_from_urls() {
    local -r extract_ipv4s='while (/$RE{URI}{HTTP}{-scheme => "https?|udp"}{-keep}/g) {say $3 if is_ipv4($3)}'

    perl -MData::Validate::IP=is_ipv4 -MRegexp::Common=URI -nE "$extract_ipv4s" 2>/dev/null
}

# Pick out adblock-style host rules of the form `||host^` (optionally
# followed by `$third-party`) from stdin and print the host in lower case.
# With "|" and "^" as field separators the host is the third field.
hostsblock() {
    local -r rule_to_host='BEGIN{FS="[|^]"}/^\|\|([[:alnum:]_-]{1,63}\.)+[[:alpha:]]+\^(\$third-party)?$/{print tolower($3)}'

    gawk "$rule_to_host"
}

# params: column number
# Treat stdin as header-less CSV (-N), skip comment lines, clean up
# whitespace and keep only the requested column.
mlr_cut_col() {
    local -r column="$1"

    mlr --csv --skip-comments -N clean-whitespace then cut -f "$column"
}

#######################################
# Decode one downloaded list and print the hosts it contains.
# The work is a single four-stage pipeline:
#   1. unpack the file according to CONTENT_FILTER
#   2. extract candidate hosts according to CONTENT_TYPE / LIST_FILTER
#   3. drop blank lines and repeated lines
#   4. validate/normalize according to LIST_FORMAT (and LIST_METHOD for IPs)
# A selector value that matches no arm makes that stage produce no output.
# Arguments:
#   $1 - path of the downloaded file
#   $2 - list method ('BLOCK' or 'ALLOW')
#   $3 - content filter (how to unpack the file)
#   $4 - content type ('TEXT', 'JSON', 'CSV' or 'YAML')
#   $5 - list filter (how to extract hosts from the content)
#   $6 - list format ('DOMAIN', 'IPV4', 'IPV6', 'CIDR4' or 'CIDR6')
# Outputs:   accepted hosts on stdout, one per line
#######################################
process_list() {
    local FILE_PATH
    local LIST_METHOD
    local CONTENT_FILTER
    local CONTENT_TYPE
    local LIST_FILTER
    local LIST_FORMAT

    FILE_PATH="$1"
    LIST_METHOD="$2"
    CONTENT_FILTER="$3"
    CONTENT_TYPE="$4"
    LIST_FILTER="$5"
    LIST_FORMAT="$6"

    # Stage 1: unpack. Archive member patterns are quoted so they reach
    # unzip untouched instead of being glob-expanded by the shell first.
    case "$CONTENT_FILTER" in
    'NONE') cat -s "$FILE_PATH" ;;
    '7Z') 7za -y -so e "$FILE_PATH" ;;
    'ZIP') zcat "$FILE_PATH" ;;
    'GZIP') gzip -cd "$FILE_PATH" ;;
    'TARBALL') tar -xOzf "$FILE_PATH" ;;
    'SQUIDGUARD') tar -xOzf "$FILE_PATH" --wildcards-match-slash --wildcards '*/domains' ;;
    'SCAFROGLIA') unzip -p "$FILE_PATH" 'blocklists-master/*.txt' ;;
    'SHADOWWHISPERER') unzip -p "$FILE_PATH" 'BlockLists-master/RAW/*' ;;
    'ESOX_LUCIUS') unzip -p "$FILE_PATH" 'PiHoleblocklists-main/*' -x PiHoleblocklists-main/LICENSE PiHoleblocklists-main/README.md ;;
    esac |
        # Stage 2: extract candidate hosts.
        case "$CONTENT_TYPE" in
        'TEXT')
            case "$LIST_FILTER" in
            'NONE') cat -s ;;
            'RAW_HOSTS_WITH_COMMENTS') mawk '/^[^[:space:]|^#|^!|^;|^$|^:]/{print $1}' ;;
            'HOSTS_FILE') ghosts -m /dev/stdin -o -p -noheader -stats=false ;;
            'ABUSE_CH_URLHAUS_DOMAIN') get_domains_from_urls ;;
            'ABUSE_CH_URLHAUS_IPV4') get_ipv4s_from_urls ;;
            'ALIENVAULT') mawk -F# '{print $1}' ;;
            'ADBLOCK') hostsblock ;;
            'GREP_IPV4') get_ipv4s ;;
            'GREP_IPV6') get_ipv6s ;;
            'BOTVIRJ_IPV4') mawk -F'|' '{print $1}' ;;
            'CRYPTOLAEMUS_DOMAIN') hxextract code /dev/stdin | head -n -1 | tail -n +6 ;;
            'CRYPTOLAEMUS_IPV4') hxextract code /dev/stdin | head -n -1 | tail -n +6 | get_ipv4s ;;
            'CYBERCRIME_DOMAIN') mawk -F/ '{print $1}' ;;
            'CYBERCRIME_IPV4') mawk -F/ '{split($1,a,":");print a[1]}' | get_ipv4s ;;
            'DATAPLANE_IPV4') mawk -F'|' '$0~/^[^#]/{gsub(/ /,""); print $3}' ;;
            'DSHIELD') mlr --tsv --skip-comments -N put '$cidr = $1 . "/" . $3' then cut -f cidr ;;
            'MYIP_DOMAIN') mawk -F, '$0~/^[^#]/{print $2}' ;;
            'MYIP_IPV4') mawk '$0~/^[^#]/{print $1}' | get_ipv4s ;;
            'MYIP_IPV6') mawk '$0~/^[^#]/{print $1}' | get_ipv6s ;;
            # Keep only lines that start with "http". (A bracket expression
            # such as [http] would match any single one of h, t or p.)
            'VXVAULT_DOMAIN') mawk '/^http/' | get_domains_from_urls ;;
            'VXVAULT_IPV4') mawk '/^http/' | get_ipv4s_from_urls ;;
            # NOTE(review): hostsblock prints one field per line, so $2 looks
            # like it is always empty here — confirm against the source list.
            'XFILES') tr -d "[:blank:]" | hostsblock | mawk '{print $2}' ;;
            'TRACKERSLIST') mawk '{print $1}' | get_domains_from_urls ;;
            'CHARLES_B_HALEY') mawk '$0~/^[^#]/{print $3}' ;;
            'QUANTUMULTX') mawk -F, '$1~/^HOST-SUFFIX$/{print $2}' ;;
            'QUINDECIM') mawk -F= '$0~/^=/{print $2}' | mawk '{print $1}' ;;
            'ZEEK_DOMAIN') mawk '/^[^[:space:]|^#]/&&$2~/^Intel::DOMAIN$/{print $1}' ;;
            'ZEEK_IPV4') mawk '/^[^[:space:]|^#]/&&$2~/^Intel::ADDR$/{print $1}' ;;
            esac
            ;;
        'JSON')
            case "$LIST_FILTER" in
            'ABUSE_CH_FEODOTRACKER_IPV4') jq -r '.[].ip_address' ;;
            'ABUSE_CH_FEODOTRACKER_DOMAIN') jq -r '.[] | select(.hostname != null) | .hostname' ;;
            'ABUSE_CH_THREATFOX_IPV4') jq -r 'to_entries[].value[].ioc_value | split(":")[0]' ;;
            'ABUSE_CH_THREATFOX_DOMAIN') jq -r 'to_entries[].value[].ioc_value' ;;
            'AYASHIGE') jq -r '.[].fqdn' ;;
            'CYBER_CURE_IPV4') jq -r '.data.ip[]' ;;
            'CYBERSAIYAN_DOMAIN') jq -r '.[] | select(.value.type == "URL") | .indicator' | get_domains_from_urls ;;
            'CYBERSAIYAN_IPV4') jq -r '.[] | select(.value.type == "URL") | .indicator' | get_ipv4s_from_urls ;;
            'DISCONNECTME_ENTITIES') jq -r '.entities[] | "\(.properties[])\n\(.resources[])"' ;;
            'DISCONNECTME_SERVICES') jq -r '.categories[] | to_entries[].value[] | to_entries[].value[]' ;;
            'HIPO_UNIVERSITIES') jq -r '.[].domains | join("\n")' ;;
            'ISCSANS') jq -r '.[].ipv4' ;;
            'MALSILO_DOMAIN') jq -r '.data[].network_traffic | select(.dns != null) | .dns[]' ;;
            'MALSILO_IPV4') jq -r '.data[].network_traffic | select(.tcp != null) | .tcp[] | split(":")[0]' ;;
            'MALTRAIL') jq -r '.[].ip' ;;
            'TINYCHECK_DOMAIN') jq -r '.iocs[] | select(.type == "domain") | .value' ;;
            'TINYCHECK_FREEDNS') jq -r '.iocs[] | select(.type == "freedns") | .value' ;;
            'TINYCHECK_IPV4') jq -r '.iocs[] | select(.type == "ip4addr") | .value' ;;
            'TINYCHECK_CIDR') jq -r '.iocs[] | select(.type == "cidr") | .value' ;;
            'CHONG_LUA_DAO_DOMAIN') jq -r '.[].url' | get_domains_from_urls ;;
            'CHONG_LUA_DAO_IPV4') jq -r '.[].url' | get_ipv4s_from_urls ;;
            'INQUEST_DOMAIN') jq -r '.data[] | select(.artifact_type == "domain") | .artifact' ;;
            'INQUEST_IPV4') jq -r '.data[] | select(.artifact_type == "ipaddress") | .artifact' ;;
            'CERTEGO') jq -rs '.[].links[].url' | mawk -F/ '$5~/^domain$/{print $6}' ;;
            'SECUREDROP') jq -r '.[] | .onion_address as $onion | .organization_url | split("/")[2] as $org | $org, $onion' ;;
            esac
            ;;
        'CSV')
            case "$LIST_FILTER" in
            'MLR_CUT_1') mlr_cut_col 1 ;;
            'MLR_CUT_2') mlr_cut_col 2 ;;
            'MLR_CUT_3') mlr_cut_col 3 ;;
            'MLR_CUT_4') mlr_cut_col 4 ;;
            'BENKOW_DOMAIN') mlr --csv --headerless-csv-output --ifs ';' cut -f url | get_domains_from_urls ;;
            'BENKOW_IPV4') mlr --csv --headerless-csv-output --ifs ';' cut -f url | get_ipv4s_from_urls ;;
            'BOTVIRJ_COVID') mawk 'NR>1' ;;
            'CYBER_CURE_DOMAIN_URL') tr ',' '\n' | get_domains_from_urls ;;
            'MALWARE_DISCOVERER_DOMAIN') mlr --csv --headerless-csv-output cut -f domain ;;
            'MALWARE_DISCOVERER_IPV4') mlr --csv --headerless-csv-output cut -f ip ;;
            'PHISHSTATS_DOMAIN') mlr_cut_col 3 | get_domains_from_urls ;;
            'PHISHSTATS_IPV4') mlr_cut_col 4 | get_ipv4s ;;
            'PHISHSTATS_IPV6') mlr_cut_col 4 | get_ipv6s ;;
            'TURRIS') mlr --csv --headerless-csv-output --skip-comments cut -f Address ;;
            'VIRIBACK_DOMAIN') mlr --csv --headerless-csv-output cut -f URL | get_domains_from_urls ;;
            'VIRIBACK_IPV4') mlr --csv --headerless-csv-output cut -f IP ;;
            'SHADOWSERVER_HOST') mlr --csv --headerless-csv-output cut -f http_host ;;
            'SHADOWSERVER_TARGET') mlr --csv --headerless-csv-output cut -f redirect_target ;;
            'WATCHLIST_INTERNET') mlr --csv --ifs ';' -N cut -f 1 ;;
            'CRUZ_IT') mlr --csv --headerless-csv-output clean-whitespace then cut -f ip_address ;;
            'PHISHTANK') mlr --csv --headerless-csv-output cut -f url | get_domains_from_urls ;;
            'BLOCKLIST_UA') mlr --csv --ifs ';' --headerless-csv-output cut -f IP ;;
            esac
            ;;
        'YAML')
            case "$LIST_FILTER" in
            'CRYPTOSCAMDB_BLACKLIST') yq '.[].name' ;;
            'CRYPTOSCAMDB_WHITELIST') yq '.[].url' | get_domains_from_urls ;;
            esac
            ;;
        # Stage 3: NF skips blank lines, !seen[$0]++ keeps first occurrences.
        esac | mawk 'NF && !seen[$0]++' |
        # Stage 4: validate by target format.
        case "$LIST_FORMAT" in
        'DOMAIN')
            perl ./process_domains.pl 2>/dev/null
            ;;
        # https://metacpan.org/pod/Data::Validate::IP
        'IPV4')
            case "$LIST_METHOD" in
            'BLOCK')
                perl -MData::Validate::IP=is_public_ipv4 -nE 'chomp; if(defined($_) && is_public_ipv4($_)) {say $_;}'
                ;;
            # Ensure bogons get whitelisted
            'ALLOW')
                perl -MData::Validate::IP=is_ipv4 -nE 'chomp; if(defined($_) && is_ipv4($_)) {say $_;}'
                ;;
            esac
            ;;
        'IPV6')
            case "$LIST_METHOD" in
            'BLOCK')
                perl -MData::Validate::IP=is_public_ipv6 -nE 'chomp; if(defined($_) && is_public_ipv6($_)) {say $_;}'
                ;;
            # Ensure bogons get whitelisted
            'ALLOW')
                perl -MData::Validate::IP=is_ipv6 -nE 'chomp; if(defined($_) && is_ipv6($_)) {say $_;}'
                ;;
            esac
            ;;
        # Both CIDR families go through the same validator script.
        'CIDR4' | 'CIDR6')
            perl ./process_cidrs.pl 2>/dev/null
            ;;
        esac
}

#######################################
# Look up, in lists.json, the entry whose key equals the file's base name
# and run process_list once for every one of its formats that matches the
# requested format.
# Arguments:
#   $1 - path of the downloaded list file
#   $2 - list method, handed through to process_list
#   $3 - list format to select
#######################################
main() {
    local list_key
    local content_filter
    local content_type
    local list_filter

    list_key="$(basename "$1")"

    # jq prints one "content filter#content type#list filter" record per
    # matching format; read splits it back apart on '#'.
    while IFS='#' read -r content_filter content_type list_filter; do
        process_list "$1" "$2" "$content_filter" "$content_type" "$list_filter" "$3"
    done < <(
        jq -r --arg key "$list_key" --arg format "$3" 'to_entries[] |
        select(.key == $key) | .value |
        .content.filter as $content_filter |
        .content.type as $content_type |
        .formats[] |
        select(.format == $format) |
        "\($content_filter)#\($content_type)#\(.filter)"' lists.json
    )
}

# Entry point: forward the three positional arguments (list file path,
# list method, list format) to main.
main "$1" "$2" "$3"

process_domains.pl

#!/usr/bin/env perl

use warnings;
use strict;
use open ':std', ':encoding(UTF-8)';
use feature 'say';
use Try::Tiny;
use Text::Trim 'trim';
use Net::IDN::Encode 'domain_to_ascii';
use Data::Validate::Domain 'is_domain';

# Read candidate host names line by line, convert each to its ASCII (IDNA)
# form and print the ones that validate as a domain. ".onion" is accepted
# as a private TLD.
while (my $line = <>) {
  chomp $line;

  # try without catch: an exception raised for one line is discarded, so
  # that line is dropped and processing carries on with the next one.
  try {
    my $ascii = domain_to_ascii(trim($line));

    # `return` leaves only this try block, i.e. skips the current line.
    return unless defined $ascii;
    return unless is_domain($ascii, { domain_private_tld => { onion => 1 } });

    say($ascii);
  };
}

process_cidrs.pl

#!/usr/bin/env perl

use warnings;
use strict;
use open ':std', ':encoding(UTF-8)';
use feature 'say';
use Try::Tiny;
use Text::Trim 'trim';
use Net::CIDR 'cidrvalidate';

# Read candidate CIDR blocks / addresses line by line and print every one
# that Net::CIDR accepts; entries it rejects are skipped.
while (my $line = <>) {
  chomp $line;

  # try without catch: an exception raised for one line is discarded, so
  # that line is dropped and processing carries on with the next one.
  try {
    # https://metacpan.org/pod/Net::CIDR#$ip=Net::CIDR::cidrvalidate($ip);
    my $cidr = cidrvalidate(trim($line));

    # cidrvalidate returns undef for input it rejects. Only that entry is
    # skipped; a `last` here would leave the surrounding read loop and
    # silently drop every line that follows the first invalid one.
    say $cidr if defined $cidr;
  };
}

관련 정보