-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkillandmail
executable file
·270 lines (219 loc) · 7.58 KB
/
killandmail
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/env bash
# Warning: this script uses some of the more... esoteric bash syntax
# including arrays, and associative arrays
# the following may be useful:
# declare -a foo # both these declare foo as an array
# foo=() #
# declare -A bar # declare 'bar' as an associative array
# foo+=(blah) # adds blah as an element to the array 'foo'
# bar+=([key]=blah) # adds blah as the element under key to the associative array 'bar'
# ${bar[@]} # expands to all elements of (associative) array 'bar'
# ${!bar[@]} # keys of all elements of (associative) array 'bar'
# Work out which cluster this is from the fully-qualified hostname.
# The human-readable name ends up in this_cluster (used in the notification
# mail); on an unrecognised host the script refuses to run at all.
hostnamef="$(hostname -f)"
# Glob patterns are used rather than regexes so that the dots in the domain
# names are matched literally. The first matching arm wins.
case "$hostnamef" in
  *legion.ucl.ac.uk*)   this_cluster="Legion" ;;
  *grace.ucl.ac.uk*)    this_cluster="Grace" ;;
  *thomas.ucl.ac.uk*)   this_cluster="Thomas" ;;
  *michael.ucl.ac.uk*)  this_cluster="Michael" ;;
  *myriad.ucl.ac.uk*)   this_cluster="Myriad" ;;
  *kathleen.ucl.ac.uk*) this_cluster="Kathleen" ;;
  *young.ucl.ac.uk*)    this_cluster="Young" ;;
  *)
    # Diagnostics go to stderr, like every other error in this script
    echo "Error: I do not know which cluster this is. Panicking..." >&2
    exit 1
    ;;
esac
# Print the command line of process $1 as a single space-separated string.
# Arguments: $1 - pid of the process to inspect
# Outputs:   the contents of /proc/<pid>/cmdline with every NUL byte turned into a space
get_pid_cmdline () {
  local cmdline_file="/proc/${1}/cmdline"
  # The file read here stores arguments NUL-separated; make them readable
  sed -e 's/\x0/ /g' "$cmdline_file"
}
# Print the user name that owns /proc/<pid>, i.e. the owner of process $1.
# Arguments: $1 - pid of the process to inspect
# Outputs:   the owning user name, as reported by stat's %U format
get_pid_owner () {
  local proc_dir="/proc/${1}"
  stat -c '%U' "$proc_dir"
}
# Print the parent pid of process $1.
# Arguments: $1 - pid of the process to inspect
# Outputs:   the parent pid, read from /proc/<pid>/stat
# Returns:   non-zero if the stat file cannot be read
# Safety:    if asked about pid 1 the whole script is killed, so that the
#            root process can never become a kill candidate.
get_pid_parent_pid () {
  if [ "$1" -eq 1 ]; then
    echo "Error: no what are you doing do not kill the root process stop" >&2
    # This function is called inside $(...), so a plain 'exit' would only end
    # that subshell; $$ still names the main script, so it is killed directly.
    kill -9 $$
  fi

  local stat_line
  local -a stat_fields
  local IFS=$' \t\n'
  stat_line=$(<"/proc/$1/stat") || return

  # The line looks like "pid (comm) state ppid ...", and comm may itself
  # contain spaces or parentheses. Drop everything up to the LAST ") " so the
  # remaining fields can be split on whitespace safely.
  stat_line="${stat_line##*) }"
  read -r -a stat_fields <<<"$stat_line"

  # After the strip: [0] = state, [1] = parent pid
  echo "${stat_fields[1]}"
}
# Print the pids of all direct children of process $1, one per line.
# Arguments: $1 - pid of the parent process
# Outputs:   one child pid per line (nothing if there are no children)
get_pid_children () {
  # Every /proc/<pid>/status file is scanned for a "PPid:" line naming $1.
  # The separator after "PPid:" is matched as any whitespace so the pattern
  # does not depend on a literal tab character surviving in this file.
  #   -H  always prefix matches with the file name (the sed below needs it)
  #   -s  stay quiet about processes that exit while the scan is running
  grep -s -H -E "^PPid:[[:space:]]+$1\$" /proc/[0-9]*/status \
    | sed -e 's_^/proc/\([0-9]*\)/status.*_\1_'
}
# Print the mail address to notify for a given user name.
# Arguments: $1 - user name
# Outputs:   the address on stdout; warnings (if any) on stderr
# Behaviour: - names starting "cours" are treated as course accounts
#            - names starting "mmm" are looked up with the external
#              mmm-to-email command, falling back to a fixed address
#            - everything else gets the default address
# NOTE(review): the address literals below are reproduced exactly as found;
# in particular the final branch does not use the user name at all - confirm
# that these are the intended addresses.
get_mail_address () {
  # Keep working variables local so callers' globals are never clobbered
  local username="$1"
  local email_search

  if [[ "${username:0:5}" == "cours" ]]; then
    echo "Warning: this is a course account, and does not have a valid mail address.">&2
    echo "[email protected]"
  elif [[ "${username:0:3}" == "mmm" ]]; then
    # Do a database lookup and if this fails send to nobody.
    if email_search=$(mmm-to-email "$username"); then
      printf '%s\n' "$email_search"
    else
      echo "Warning: this is an external account, and does not have a valid mail address.">&2
      echo "[email protected]"
    fi
  else
    echo "[email protected]"
  fi
}
# Command-line flags. They are reset here so that same-named variables
# inherited from the caller's environment cannot silently enable forced
# killing or suppress mail.
option_force=""   # -f: kill without asking
do_not_mail=""    # -n / -b: do not actually send mail
show_mail=""      # -b: print the mail that would have been sent

# Leading ':' in the optstring selects getopts' silent mode, so unknown
# options arrive as '?' with the offending letter in OPTARG.
while getopts ":fhnb" opt; do
  case "$opt" in
    h)
      echo "
usage: $0 [options] pid [pid [...]]
Options:
-h show this help message
-f do not ask before killing processes
-n do not send mail
-b show mail that would be sent but do not send it
"
      exit 0
      ;;
    f)
      option_force="y"
      ;;
    n)
      do_not_mail="y"
      ;;
    b)
      do_not_mail="y"
      show_mail="y"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 4
      ;;
  esac
done

# Drop the parsed options; what remains are the pids to act on
shift $((OPTIND-1));

if [ $# -lt 1 ]; then
  echo "Error: no pids provided. Use '$0 -h' to see options and usage." >&2
  exit 1
fi
#################
# Okay, done checking and setting up, start with the doing stuff

# Succeeds only when $1 consists solely of decimal digits, i.e. could be a pid.
is_valid_pid () {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Parallel arrays: index i of each one describes the i-th successfully killed process.
killed_pids=()
killed_cmdlines=()
killed_owners=()

for pid in "$@"; do
  # Reject non-numeric arguments up front: they would otherwise be used in
  # /proc paths (where e.g. "self" exists) and pasted into the commands that
  # are run as the process owner further down.
  if ! is_valid_pid "$pid"; then
    echo "Error: '$pid' is not a valid pid" >&2
    continue
  fi
  if [ ! -d "/proc/${pid}" ]; then
    echo "Error: no process with pid $pid" >&2
    continue
  fi

  # Gather and show what we know about the process before touching it
  echo "Examining process ${pid}..."
  pid_owner=$(get_pid_owner "$pid")
  pid_cmdline=$(get_pid_cmdline "$pid")
  pid_parent_pid=$(get_pid_parent_pid "$pid")
  mapfile -t pid_children < <(get_pid_children "$pid")

  echo "Process $pid owned by $pid_owner has command line: $pid_cmdline"
  echo " parent: $pid_parent_pid: $(get_pid_cmdline "$pid_parent_pid")"
  if [ ${#pid_children[@]} -ne 0 ]; then
    echo " and children:"
    for child_pid in "${pid_children[@]}"; do
      child_cmdline=$(get_pid_cmdline "$child_pid")
      printf " %7d: %s\n" "$child_pid" "$child_cmdline"
    done
  fi

  # Ask for confirmation unless -f was given
  if [ -z "$option_force" ]; then
    read -n 1 -r -p "Attempt to kill this process? [y|N]" definitely_kill
    echo
    # ^- read doesn't add a blank line
  else
    definitely_kill="y"
  fi
  if [ "$definitely_kill" != "y" ]; then
    echo "Okay, skipping..."
    continue
  fi

  # become user, kill process
  # grumble grumble become is trash grumble
  # The commands are fed to 'become' on stdin. "Beacon" is echoed first so we
  # can tell from the captured output whether we got as far as running
  # commands as the target user at all.
  echo "Becoming process owner, ${pid_owner}..."
  become_output=$(sudo /shared/ucl/sysops/libexec/become "$pid_owner" <<EOF
echo "Beacon"
kill -9 $pid
EOF
)
  become_exit_status=$?

  if [ "${become_output:0:6}" != "Beacon" ]; then
    echo "Error: could not become user ${pid_owner}, failed to kill ${pid}" >&2
  elif [ $become_exit_status -ne 0 ]; then
    echo "Error: became user but there was a problem killing the process ${pid}" >&2
  else
    echo "kill seems successful, checking... "
    # if this sleep isn't here, the process seems to not have time to die and clean-up
    sleep 10
    if [ -d "/proc/${pid}" ]; then # there's still a directory in proc with the pid we just tried to kill
      echo "Error: there is still a process with pid ${pid}" >&2
      if [ "$(get_pid_cmdline "$pid")" = "$pid_cmdline" ]; then
        echo "Error: (ctd.) remaining process has same cmdline so is unlikely to be a new process using the same pid. Try hitting it harder?" >&2
      fi
    else
      echo "Yep, looks killed."
      # Remember what was killed so the owner can be mailed afterwards
      killed_pids+=("$pid")
      killed_cmdlines+=("$pid_cmdline")
      killed_owners+=("$pid_owner")
    fi
  fi
done
############
# Finished killing things, now send mails

# Associative array: owner user name -> text listing that owner's killed
# processes, one "<newline> pid: cmdline" entry per process.
declare -A mail_inserts

# Format things we did into a mail-friendly list
for (( i=0; i<${#killed_pids[@]}; i++ )); do
  mail_inserts["${killed_owners[$i]}"]+=$'\n'" ${killed_pids[$i]}: ${killed_cmdlines[$i]}"
done

#echo "Variable inventory before attempting mail step:"
#echo "Killed pids: ${!killed_pids[@]}"
#echo " -> ${killed_pids[@]}"
#echo "Killed cmdlines: ${!killed_cmdlines[@]}"
#echo " -> ${killed_cmdlines[@]}"
#echo "Killed owners: ${!killed_owners[@]}"
#echo " -> ${killed_owners[@]}"
#echo "Mail inserts: ${!mail_inserts[@]}"

# Override mailx command with a shell function if we don't want to send mail.
# Done once, before the loop, since the choice does not depend on the owner.
if [ -n "$do_not_mail" ]; then
  if [ -n "$show_mail" ]; then
    # Show the arguments and the message body instead of sending
    mailx() { (echo "$@" && cat); }
  else
    mailx() { :; }
  fi
fi

# One mail per owner, listing all of that owner's killed processes
for owner in "${!mail_inserts[@]}"; do
  # printf rather than 'echo -e': command lines are arbitrary user data and
  # must not have their backslash sequences interpreted.
  printf '%s\n' "${mail_inserts[$owner]}"

  owner_address=$(get_mail_address "$owner")
  if [ -z "$owner_address" ]; then
    echo "Error: could not get mail address for user $owner" >&2
    exit 3
  fi

  mailx \
    -s "${this_cluster} Login Node Processes Killed for $owner" \
    -S [email protected] \
    -b [email protected] \
    "$owner_address" \
    <<EOF
[This is an automated mail.]
One or more of your processes running on ${this_cluster}'s login nodes have been identified as using excessive resources or being a misuse of these nodes, and have been killed.
These were: ${mail_inserts[$owner]}
Most often this is due to attempting to run workloads that should be submitted as a job to the batch system instead. If you require more information, please consult the documentation at http://wiki.rc.ucl.ac.uk/ or contact [email protected]
Regards,
The RC Support Team
EOF
  mail_status=$?

  if [ "$mail_status" -eq 0 ]; then
    echo "Successfully sent mail to $owner"
  else
    echo "Error: failed sending mail attempt to $owner" >&2
  fi
  # e.g.
  # mailx -s "Processes killed: blah" -S [email protected] [email protected] <<<"This is a test mail."
  # remember Heather wanted emails to come to us as well to keep track of numbers and repeat offenders
done
# Ends mailer block