Corehandler für Linux-Systeme in Python3
Nicht nur, wenn man Software in C oder C++ entwickelt, muss man im Betrieb mit einem core rechnen. Neben einigen nicht zu bejubelnden Neuerungen in manchen Linux-Systemen (die unter dem Signet 'Security' eingebracht wurden) gibt es ein sehr nützliches Feature: Man kann ein eigenes Filterprogramm als corehandler hinterlegen, das dann vom kernel im Falle eines Abbruchs aufgerufen wird und dabei den eigentlichen core als Standard-Eingabe stdin/cin (daher Filterprogramm genannt) erhält. Da dies genau zum Zeitpunkt des Abbruchs geschieht, ist dies die beste Gelegenheit, alle möglichen Informationen rund um den Prozess bzw. rund um das Betriebssystem einzuholen. Darüber hinaus kann man bei der Gelegenheit auch die shared-libraries der Anwendung einsammeln und zum Konvolut der Erhebung hinzufügen. (Damit wird das Debuggen der Anwendung auch auf anderen OS-Versionen als der im Produktionsumfeld eingesetzten Version möglich.)
Neben dem core kümmert sich der w.u. gezeigte corehandler um
- shared-libraries Ermittlung (und Sicherung) via /usr/bin/ldd
- Kopie des abgestürzten Executables
- Kopie der Log- und Trace-files der Anwendung
- /usr/sbin/iptables -n --list
- /bin/netstat
- /usr/sbin/ip addr
- /usr/sbin/ip route
- /usr/sbin/ip -6 route
- /usr/bin/dmesg
- /usr/bin/ps aux
- cat /proc/cpuinfo
- cat /proc/meminfo
- cat /etc/os-release
- cat /etc/hostname
- cat /proc/cmdline
Alle diese Informationen werden in separaten files abgelegt, die zum Schluss in ein gezipptes tar gepackt werden. Zu diesem Zeitpunkt der Verarbeitung wäre dann eine Notifikation (via mail, sms, telegram, oder, wie hier im Script, per checkMK) an die entsprechende Abteilung sinnvoll. (Wir haben nun Maschine, Zeitpunkt und alle Informationen (als Pfad zum tar-Archiv) zu vermelden.)
Die Installation des eigenen corehandlers erfolgt durch folgenden Aufruf:
echo "|/opt/thovid/corephae/bin/corehandler %P %p %e %t" > /proc/sys/kernel/core_pattern
Informationen zu den Aufrufparametern erhält man mit "man 5 core".
Der Aufruf erfolgt am besten in dem Start-Script der Anwendung, da auch andere player auf dem System den corehandler setzen könnten.
Wie oben angemerkt, erhält 'unser' corehandler den eigentlichen core via Standard-Eingabe. Zu beachten ist dabei, dass - sobald die Standard-Eingabe vollständig gelesen ist - die assoziierte proc-Struktur des abgebrochenen Prozesses gelöscht wird. Daher sind alle Zugriffe auf /proc des Prozesses vor dem Verarbeiten des cores zu erledigen.
#!/usr/bin/env python3
# core_handler in python 3
#
# install (as root):
#   echo "|$(pwd)/corehandler %P %p %e %t" > /proc/sys/kernel/core_pattern
#
# be careful: the corehandler is started directly by the kernel (see core(5))!
# do not access NFS-mounts or java-stuff.
# As long as the corehandler runs, no other instance of the cored executable can be spawned!
#
# Order of work matters: everything that needs /proc/<pid>/ is collected
# first, the core itself is read from stdin last, because the /proc/<pid>/
# subtree vanishes once stdin has been consumed.

import os
import sys
import time
import shlex
import socket
import subprocess
import ch_utils as util
import configparser


def _collect(cmd, target):
    """Store the output of *cmd* (an argv list) in the file *target*.

    Thin wrapper around util.fetch_cmd that logs an OSError (e.g. the tool
    is not installed) to the module-level message file ``msg`` instead of
    aborting the handler - a missing tool must never cost us the core.
    Returns True when fetch_cmd completed, False otherwise.
    """
    try:
        util.fetch_cmd(cmd, target, msg)
    except OSError as err:
        print("corehandler: failed to collect %s into %s: %s" % (cmd, target, err), file=msg)
        return False
    return True


def _copy_file(source, target):
    """Copy *source* to *target* with cp(1) and return cp's exit status.

    The command is passed as an argv list (no shell), so paths containing
    blanks or shell meta-characters are handled correctly.
    Raises OSError if cp cannot be started.
    """
    return subprocess.call(["cp", source, target])


# first, read from config ip and port of the checkMK-system to send udp-messages to it:
UdpIp = None
UdpPort = None
config = configparser.ConfigParser()
try:
    config.read("/opt/thovid/corephae/corehandler.cfg")
    cfg_ip = config["checkMK"]["ip"]
    cfg_port = int(config["checkMK"]["port"])
except (KeyError, ValueError, configparser.Error):
    # missing or malformed configuration: notification stays disabled
    pass
else:
    # only take over the values when BOTH could be read, so that we never
    # end up with an ip but no usable port
    UdpIp, UdpPort = cfg_ip, cfg_port
    print("ip=%s, port=%d" % (UdpIp, UdpPort))

# we have to expand the local path, as SuSe has multiple places where to store system-utils...
os.environ["PATH"] = "/bin:/sbin:/usr/sbin:/usr/bin:/usr/local/bin"

# the 4 command-line-args (%P %p %e %t):
arg_ipid = int(sys.argv[1])
arg_pid = int(sys.argv[2])
arg_binary = str(sys.argv[3])
arg_timestamp = str(sys.argv[4])

workdir = "/var/opt/thovid/core_storage/pm.%s.%d.%s/" % (arg_binary, arg_pid, arg_timestamp)
try:
    os.makedirs(workdir)
except OSError:
    print("Failed to creat workdir %s - EXIT now" % workdir)
    sys.exit(2)

msg = open(workdir + "info_file", 'w+')  # the message file

proc_exe_path = "/proc/%d/exe" % (arg_pid)
try:
    executable = os.readlink(proc_exe_path)
except OSError as err:
    print("corehandler: readlink(%s) failed: %s" % (proc_exe_path, err), file=msg)
    executable = ""
if not executable:
    print("corehandler: could'nt get executable by proc-map - early exit", file=msg)
    msg.close()
    sys.exit(4)

print("executable by proc-map = %s" % (executable), file=msg)
bin_path = os.path.dirname(executable)
binary = os.path.basename(executable)
print("binary path by proc-map = %s, basename = %s" % (bin_path, binary), file=msg)

# access the proc's /proc/-subtree as long it exists:
proc_so_map = "/proc/%d/maps" % (arg_pid)
_collect(["cat", proc_so_map], workdir + "proc_so_map")

# extract the shared libraries used via 'ldd':
libdir = workdir + "libs"  # the local root, where to store the shared libraries
try:
    os.makedirs(libdir, exist_ok=True)
except OSError as err:
    print("corehandler: failed to create %s: %s" % (libdir, err), file=msg)

_collect(["ldd", executable], workdir + "so_list")
try:
    with open(workdir + "so_list", 'r') as so:
        so_lines = so.readlines()
except OSError as err:
    print("corehandler: failed to read so_list: %s" % (err), file=msg)
    so_lines = []

for line in so_lines:
    # only lines of the form "<name> => /abs/path (0x...)" carry a copyable library
    tokens = line.split()
    if len(tokens) <= 3 or not tokens[2].startswith("/"):
        continue
    a_so_lib = tokens[2]
    # a_so_lib is absolute, so plain concatenation mirrors its path below libdir
    so_target_dir = libdir + os.path.dirname(a_so_lib)
    so_target = libdir + a_so_lib
    print("so: %s" % a_so_lib, file=msg)
    print("so-target_dir=%s" % so_target_dir, file=msg)
    print("so-target=%s" % so_target, file=msg)
    try:
        os.makedirs(so_target_dir, exist_ok=True)
        rc = _copy_file(a_so_lib, so_target)
    except OSError as err:
        print("so: failed to copy %s: %s" % (a_so_lib, err), file=msg)
    else:
        if rc != 0:
            print("so: cp of %s ended with rc=%d" % (a_so_lib, int(rc)), file=msg)

# create a conf-file for the gdb that expands the search-path for shared libs by our storage:
with open(workdir + "gdb.conf", 'w') as gdb:
    print("set sysroot libs\nset substitute-path /ffmbuild/systest .\n", file=gdb)

# /// additional info:
_collect(["iptables", "--list", "-n"], workdir + "iptables")
_collect(["ip6tables", "--list", "-n"], workdir + "ip6tables")
_collect(["ss"], workdir + "netstat")
_collect(["ip", "addr"], workdir + "ip_addr")
_collect(["ip", "route"], workdir + "ip_route")
_collect(["ip", "-6", "route"], workdir + "ip_route_6")
_collect(["dmesg"], workdir + "kernel_dmesg")
_collect(["ps", "-aux"], workdir + "processes")
_collect(["cat", "/proc/cpuinfo"], workdir + "proc_cpu_info")
_collect(["cat", "/proc/meminfo"], workdir + "proc_mem_info")
_collect(["cat", "/etc/os-release"], workdir + "etc_os_release")
_collect(["cat", "/etc/hostname"], workdir + "etc_hostname")
_collect(["cat", "/proc/cmdline"], workdir + "kernel_cmd_line")

# cp the excutable:
# NOTE(review): this copies whatever is stored at the readlink()ed path right
# now; if the file was replaced while the process was running, that is the
# new file, not the one that cored.
try:
    cmd = "cp %s %s" % (executable, workdir + binary)
    print("corehandler going to exec <%s>" % (cmd), file=msg)
    rc = _copy_file(executable, workdir + binary)
    print("corehandler copied binary from %s to %s ended with rc=%d" % (executable, workdir + binary, int(rc)), file=msg)
except OSError:
    print("corehandler failed to cp executable (%s) to workdir (%s)" % (executable, workdir + binary), file=msg)

# Call the application specific "collect_appl_crash_info"-file if it exists:
cmd_path = bin_path + "/collect_appl_crash_info"
# still run via the shell (as before), but with both parts quoted so that
# blanks or meta-characters in the paths cannot split or alter the command
cmd = shlex.quote(cmd_path) + ' ' + shlex.quote(workdir)
print("corehandler: checking for crash_info-script <%s>" % (cmd_path), file=msg)
if os.path.isfile(cmd_path):
    try:
        rc = subprocess.call(cmd, shell=True)
        print("corehandler: executed <%s> with rc=%d" % (cmd, int(rc)), file=msg)
    except OSError as err:
        print("corehandler: failed to execute <%s>: %s" % (cmd, err), file=msg)
else:
    print("corehandler: <%s> does not exist" % (cmd_path), file=msg)

# Notify cerberus (checkMK) via UDP:
if UdpIp is not None:
    date = time.asctime(time.localtime(time.time()))
    message = "linux coredump of <%s> on <%s> at %s, storage: %s" % (
        binary, socket.gethostname(), date, workdir + "tar.gz")
    try:
        # the context manager closes the socket again, even if sendto fails
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.sendto(message.encode(), (UdpIp, UdpPort))
        print("corehandler: notified cerberus with '%s' ..." % (message), file=msg)
    except (OSError, OverflowError):
        # OverflowError: port number outside 0-65535
        print("corehandler: failed to notify cerberus (%s:%d) (message was '%s')." % (UdpIp, UdpPort, message), file=msg)
else:
    print("corehandler: does not notify cerberus as it's not configured.", file=msg)

t_util = time.process_time()

# FINALLY write the core itself. finally, cause after stdin has been consumed,
# the /proc/<pid>/-subtree vanishes away:
# /// ------------------------------------------ write the core
core_file_name = workdir + "core"
total_written = 0
with open(core_file_name, "ab") as core_file:
    while True:
        core_data = sys.stdin.buffer.read(65536)
        if not core_data:
            break
        total_written += core_file.write(core_data)
t_sum = time.process_time()
print("core data written into file, %d bytes" % (total_written), file=msg)
# /// ------------------------------------------ / write the core

# both figures are CPU times of this process (time.process_time), not wall-clock times
print("corehandler: done, t(process)=%lf seconds" % (t_sum), file=msg)
print("corehandler: core-creation took %lf, anything else took %lf" % (t_sum - t_util, t_util), file=msg)
msg.flush()  # make info_file complete on disk before the archive is built

try:
    # invoke the compressed tar creator.
    # NOTE(review): communicate() waits until the script has terminated, so
    # this only is a "background" job if the script detaches itself - confirm.
    p = subprocess.Popen(['/opt/thovid/corephae/creat_compressed_tar.sh', workdir],
                         cwd=workdir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()  # returns byteareas
    print("corehandler: invoked compressed-tar-creation in backgound, stdout=%s, stderr=%s" % (stdout, stderr), file=msg)
except OSError:
    print("corehandler: failed to invoke compressed-tar-creation in backgound", file=msg)
finally:
    msg.close()  # finally, close message file
Optimierungspotential: Man könnte das executable auch schon beim Start kopieren und zum Zeitpunkt des cores referenzieren (man hat ja die pid). Grund: Ist die Anwendung längere Zeit gelaufen, könnte das executable längst ausgetauscht worden sein - in diesem Fall würden wir das neue executable kopieren, nicht das ge-core-te!
Zu dem eigentlichen corehandler gehört noch ein kleines Submodul, ch_utils.py:
import subprocess
import sys
import string
import threading
import os.path


# executor of given commandline. The commandline has to be an array
def execute(cmd, directory):
    """Run *cmd* in *directory* and return ``(returncode, stdout_text)``.

    cmd       -- the command as an argv list (no shell is involved)
    directory -- working directory for the child process

    stdout is decoded as UTF-8; bytes that are not valid UTF-8 are replaced
    by U+FFFD, so the result is always real text and never the ``b'...'``
    repr of the raw bytes. stderr is captured and discarded.
    Raises OSError if the command cannot be started.
    """
    proc = subprocess.Popen(
        cmd,
        cwd=directory,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    stdout, _stderr = proc.communicate()  # returns byteareas
    return proc.returncode, stdout.decode('utf-8', errors='replace')


# execute a command and read its output, write into 'file', trace at msg_fd
def fetch_cmd(cmd, file, msg_fd):
    """Run *cmd* and store its standard output in *file*.

    cmd    -- the command as an argv list
    file   -- path of the file to (over)write with the command's output
    msg_fd -- writable text stream that receives the trace messages

    The output is written unmodified (UTF-8 encoded), i.e. a literal
    backslash-n in the command's output stays a backslash-n. A non-zero
    exit status of the command is reported on msg_fd.
    Raises OSError if the command cannot be started or *file* cannot be
    written.
    """
    rc, result = execute(cmd, ".")
    # explicit encoding: the locale of the calling process must not decide
    # whether non-ASCII output can be stored
    with open(file, "w", encoding="utf-8") as out:
        out.write(result)
    if rc != 0:
        print("Command %s ended with rc=%d" % (cmd, rc), file=msg_fd)
    print("Retrieved data from command %s into file %s" % (cmd, file), file=msg_fd)