#!/bin/bash
#
# This script gets the list of domains to scan, runs the scanners,
# collects the data, and loads it into Elasticsearch and S3.
# It also cleans up old scan indexes (older than 4 days) to prevent clutter;
# 4 days is deliberately conservative with disk usage on the Elasticsearch
# cluster, but longer retention periods are possible.
#
# It is meant to be run like so:
# cf run-task scanner-ui /app/scan_engine.sh -m 2048M
# This is where you add more scan types
SCANTYPES="
pagedata
200scanner
uswds2
sitemap
privacy
dap
third_parties
pshtt
"
# This is where you set the repo/branch
DOMAINSCANREPO="https://github.com/18F/domain-scan"
BRANCH="lighthouse-scan-initial"
# How many days of scan indexes to keep around in Elasticsearch
INDEXDAYS=4
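# One index is created per scan type per day. Note that wc -l also counts the
# blank first and last lines of SCANTYPES, hence the "- 2" below.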
NUMSCANS=$(echo "$SCANTYPES" | wc -l)
let "INDEXLINES=$INDEXDAYS * ($NUMSCANS - 2)"
# make sure the credentials are set
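# If they are not already set in the environment, fall back to the values in
# the Cloud Foundry S3 service binding (VCAP_SERVICES).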
if [ -z "$AWS_ACCESS_KEY_ID" ] ; then
AWS_ACCESS_KEY_ID=$(echo "$VCAP_SERVICES" | jq -r '.s3[0].credentials.access_key_id')
export AWS_ACCESS_KEY_ID
fi
if [ -z "$AWS_SECRET_ACCESS_KEY" ] ; then
AWS_SECRET_ACCESS_KEY=$(echo "$VCAP_SERVICES" | jq -r '.s3[0].credentials.secret_access_key')
export AWS_SECRET_ACCESS_KEY
fi
if [ -z "$AWS_DEFAULT_REGION" ] ; then
AWS_DEFAULT_REGION=$(echo "$VCAP_SERVICES" | jq -r '.s3[0].credentials.region')
export AWS_DEFAULT_REGION
fi
if [ -z "$ESURL" ] ; then
ESURL=$(echo "$VCAP_SERVICES" | jq -r '.elasticsearch56[0].credentials.uri')
fi
# make sure we have all the arguments we need
if [ -z "$BUCKETNAME" ] ; then
BUCKET=$(echo "$VCAP_SERVICES" | jq -r '.s3[0].credentials.bucket')
if [ -z "$BUCKET" ] ; then
echo "no bucket supplied"
echo "usage: $0 <s3 bucket name>"
echo "example: $0 scanbucket"
exit 1
fi
else
BUCKET="$BUCKETNAME"
fi
cd /app
# set up domain-scan
echo "installing domain-scan: this repo is big, so it can take a while"
if [ -d domain-scan ] ; then
  echo "found existing domain-scan dir, syncing it as a shortcut"
  cd domain-scan
  git checkout "$BRANCH"
else
  git clone --depth=1 --branch "$BRANCH" "$DOMAINSCANREPO"
  cd domain-scan
fi
# Install additional packages needed for headless Chrome.
# These _should_ already be installed by the apt buildpack, but this ensures
# that the test environment gets them too.
apt-get update
apt-get install -y awscli gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
# install node stuff for third_parties plugin
if [ -d ../node_modules ] ; then
  # npm install was already done by the buildpack, let's just use it and save time
  ln -s ../node_modules .
fi
npm install
# Tell the Lighthouse scanner where to find the Lighthouse CLI
export LIGHTHOUSE_PATH=/app/node_modules/lighthouse/lighthouse-cli/index.js
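# puppeteer downloads a compatible Chromium during "npm install"; use that
# bundled binary as the Chrome that Lighthouse drives.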
export CHROME_PATH=$(node -e "console.log(require('puppeteer').executablePath())")
#export CHROME_PATH=/usr/bin/chromium
# get the domains and split them up. If we were told to process a particular file,
# select it. Otherwise, scan everything.
if [ -d "/home/vcap/tmp/splitdir" ] ; then
TMPDIR="${TMPDIR:-/home/vcap/tmp/splitdir}"
else
TMPDIR="${TMPDIR:-/tmp/splitdir}"
fi
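# getdomains.sh is expected to fetch the domain list, write /tmp/domains.csv
# (used for metadata below), and split the domains into chunk files under $TMPDIR.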
../getdomains.sh "$TMPDIR"
if [ -f "$TMPDIR/$1" ] ; then
DOMAINFILES="$TMPDIR/$1"
else
if [ -d "$TMPDIR/$1" ] ; then
DOMAINFILES=$(ls "$TMPDIR"/*)
else
echo "could not find $1: Aborting!"
exit 1
fi
fi
# clean up old scans (if there are any)
if [ -d ./cache ] ; then
  rm -rf cache
  mkdir cache
fi
# execute the scans
echo "Scan start: $(date)"
for z in ${DOMAINFILES} ; do
  for i in ${SCANTYPES} ; do
    if ./scan "$z" --scan="$i" ; then
      echo "scan of $i from $z successful"
    else
      echo "scan of $i from $z errored out for some reason"
    fi
  done
done
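# domain-scan writes per-domain results to cache/<scantype>/<domain>.json;
# those files are post-processed and loaded below.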
# add metadata and put scan results into ES
echo "Adding metadata and loading scan results into elasticsearch. This can take a while..."
for i in ${SCANTYPES} ; do
  DATE=$(date +%Y-%m-%dT%H:%M:%SZ)
  SHORTDATE=$(date +%Y-%m-%d)
  echo "loading scantype $i with index: $SHORTDATE-$i"
  # set the domain field to be a keyword rather than text so we can sort on it
  echo '{"settings": {"index.mapping.total_fields.limit": 2000, "index.max_result_window": 2000000}, "mappings": {"scan": {"properties": {"domain": {"type": "keyword"}}}}}' > /tmp/mapping.json
  if curl -s -XPUT "$ESURL/$SHORTDATE-$i" -d @/tmp/mapping.json | grep error ; then
    echo "problem creating mapping"
  fi
  # import all of the scans
  for j in cache/"$i"/*.json ; do
    DOMAIN=$(basename -s .json "$j")
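    # columns 2-4 of /tmp/domains.csv hold the domain type, agency, and organization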
    CSVLINE=$(grep -Ei "^$DOMAIN," /tmp/domains.csv)
    DOMAINTYPE=$(echo "$CSVLINE" | awk -F, '{print $2}' | tr -d \")
    AGENCY=$(echo "$CSVLINE" | awk -F, '{print $3}' | tr -d \")
    ORG=$(echo "$CSVLINE" | awk -F, '{print $4}' | tr '\\' '-' | tr -d \")
    # add metadata
    echo "{\"domain\":\"$DOMAIN\"," > /tmp/scan.json
    echo " \"scantype\":\"$i\"," >> /tmp/scan.json
    echo " \"domaintype\":\"$DOMAINTYPE\"," >> /tmp/scan.json
    echo " \"agency\":\"$AGENCY\"," >> /tmp/scan.json
    echo " \"organization\":\"$ORG\"," >> /tmp/scan.json
    echo " \"scan_data_url\":\"https://s3-$AWS_DEFAULT_REGION.amazonaws.com/$BUCKET/$i/$DOMAIN.json\"," >> /tmp/scan.json
    echo " \"lastmodified\":\"$DATE\"," >> /tmp/scan.json
    echo " \"data\":" >> /tmp/scan.json
    # You cannot have "." in ES field names, so deperiodkeys.py replaces them
    # with "//" in the copy of the document that we index.
    cp /tmp/scan.json /tmp/noperiodsscan.json
    ../deperiodkeys.py "$j" >> /tmp/noperiodsscan.json
    echo "}" >> /tmp/noperiodsscan.json
    # This is the document that will go into S3.
    cat "$j" >> /tmp/scan.json
    echo "}" >> /tmp/scan.json
    jq . /tmp/scan.json > "$j"
    # slurp the data in
    if curl -s -XPOST "$ESURL/$SHORTDATE-$i/scan" -d @/tmp/noperiodsscan.json | grep error ; then
      echo "problem importing $j: $(cat /tmp/noperiodsscan.json)"
    fi
  done
done
# delete old indexes in ES
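# Keep the most recent INDEXDAYS days' worth of dated index names (INDEXLINES
# names in total); anything not on the keepers list is deleted.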
curl -s "$ESURL/_cat/indices" | grep -E '[0-9]{4}-[0-9]{2}-[0-9]{2}-.+' | awk '{print $3}' | sort -rn | head -"$INDEXLINES" > /tmp/keepers
curl -s "$ESURL/_cat/indices" | grep -E '[0-9]{4}-[0-9]{2}-[0-9]{2}-.+' | awk '{print $3}' | while read line ; do
if echo "$line" | grep -Ff /tmp/keepers >/dev/null ; then
echo keeping "$line" index
else
echo deleting "$line" index
curl -s -X DELETE "$ESURL/$line"
echo
fi
done
# put scan results into s3
for i in ${SCANTYPES} ; do
  echo "copying $i data to s3://$BUCKET/$i/"
  # S3ENDPOINT is left unquoted on purpose: when testing, it can carry extra
  # aws-cli options that point this at a local minio instance.
  if aws $S3ENDPOINT s3 cp "cache/$i/" "s3://$BUCKET/$i/" --recursive --only-show-errors ; then
    echo "copy of $i to s3 bucket successful"
  else
    echo "copy of $i to s3 bucket errored out"
  fi
done
echo "Scan end: $(date)"