Commit 3bdaa2a1 authored by Petr Wehrenberg's avatar Petr Wehrenberg
Browse files

Pylint changes and new data structure parser

parent da3cc1e7
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -3,3 +3,4 @@
.idea
Pipfile.lock
.ipynb_checkpoints/*
__pycache__/*

elasticsearch_connection.py

deleted100644 → 0
+0 −111
Original line number Diff line number Diff line
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import re


class ElasticSearchConnection:
    """Wrapper around an Elasticsearch client that fetches KYPO bash command
    logs and training events and converts them to typed pandas DataFrames.

    Typical usage: construct, call set_sandbox_id / set_time_from /
    set_time_to, then fetch_bash_logs() / fetch_events().
    """

    # Index-name prefix of the per-sandbox bash command log indices.
    BASH_INDEX_PREFIX = 'kypo.logs.console.bash.command.pool=1.sandbox='

    def __init__(self, host, port):
        # NOTE(review): the original passed HOST=/PORT= keyword arguments,
        # which the elasticsearch client does not recognise and silently
        # ignores (it then falls back to its localhost:9200 default).
        # Passing an explicit host URL makes the parameters take effect.
        self.es = Elasticsearch(hosts=[host + ':' + str(port)])
        self.sandbox_ids = []   # list of sandbox-id strings to query
        self.time_from = None   # ISO timestamp string, e.g. '2020-04-05T17:20:00'
        self.time_to = None     # ISO timestamp string (upper bound)

    def set_sandbox_id(self, sb_id):
        """Set the list of sandbox ids (strings) the fetch methods query."""
        self.sandbox_ids = sb_id

    def set_time_from(self, time_from):
        """Set the lower bound (ISO timestamp string) of the query range."""
        self.time_from = time_from

    def set_time_to(self, time_to):
        """Set the upper bound (ISO timestamp string) of the query range."""
        self.time_to = time_to

    def _bash_index_pattern(self):
        """Comma-separated bash-log index pattern for all configured sandboxes."""
        return ','.join(self.BASH_INDEX_PREFIX + sb_id for sb_id in self.sandbox_ids)

    def fetch_bash_logs(self):
        """Fetch bash logs for the configured sandboxes/time range as a DataFrame."""
        resp_bash = self.elasticsearch_search(
            'bash logs', self._bash_index_pattern(),
            self.build_date_range_query('timestamp_str'))
        return self.transform_bash_logs_to_df(resp_bash)

    def fetch_bash_logs_raw(self):
        """Fetch bash logs as the raw Elasticsearch hit list (no DataFrame).

        NOTE(review): the original file defined this method twice with
        identical bodies; the duplicate definition was removed.
        """
        return self.elasticsearch_search(
            'bash logs', self._bash_index_pattern(),
            self.build_date_range_query('timestamp_str'))

    def sniff_bash_logs(self):
        """Summarize bash logs over *all* sandboxes: first/last timestamp and
        command count per sandbox."""
        path_bash = self.BASH_INDEX_PREFIX + '*'
        resp_bash = self.elasticsearch_search('bash logs', path_bash, {'size': 9999})
        bash_df = self.transform_bash_logs_to_df(resp_bash)
        return bash_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max], 'cmd_raw': ['count']})

    def sniff_events(self):
        """Summarize training events over *all* sandboxes: first/last timestamp
        and event count per sandbox."""
        path_events = 'kypo.*_evt.*.sandbox=*.definition=*.instance=*.run=*'
        resp_events = self.elasticsearch_search('events', path_events, {'size': 9999})
        events_df = self.transform_events_to_df(resp_events)
        return events_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max], 'event_type': ['count']})

    def fetch_events(self):
        """Fetch training events for the configured sandboxes/time range as a DataFrame."""
        path_events = ','.join(
            'kypo.*_evt.*.sandbox=' + sb_id + '.definition=*.instance=*.run=*'
            for sb_id in self.sandbox_ids)
        resp_events = self.elasticsearch_search(
            'events', path_events, self.build_date_range_query('syslog.@timestamp'))
        return self.transform_events_to_df(resp_events)

    def elasticsearch_search(self, call_name, index, body):
        """Run a search and return the hit list; on an Elasticsearch error,
        print a friendly message and return [] (404s are not raised)."""
        resp = self.es.search(index=index, body=body, ignore=[404])
        if 'error' in resp:
            print('[!] Request problem in', call_name, '-', self.resolve_elastic_error(resp['error']['type']))
            return []
        return resp['hits']['hits']

    def build_date_range_query(self, timestamp_field):
        """Build a range query restricting *timestamp_field* to [time_from, time_to].

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
        NOTE(review): the '+02:00' offset (CEST) is hard-coded — confirm it
        matches the timezone the logs were recorded in.
        """
        return {'size': 9999,
                'query': {'range': {timestamp_field: {'gte': self.time_from + '+02:00',
                                                      'lte': self.time_to + '+02:00'}}}}

    def transform_bash_logs_to_df(self, bash_logs):
        """Convert raw bash-log hits to a DataFrame with columns
        sandbox_id (int), timestamp (datetime), cmd_raw (string), cmd_type (category)."""
        cols = {'sandbox_id': [], 'timestamp': [], 'cmd_raw': [], 'cmd_type': []}
        for log in bash_logs:
            # The sandbox id only appears in the index name, not the document.
            cols['sandbox_id'].append(re.search(r'sandbox=(\d*)', log['_index']).group(1))
            cols['timestamp'].append(log['_source']['timestamp_str'])
            cols['cmd_raw'].append(log['_source']['cmd'])
            cols['cmd_type'].append(log['_source']['cmd_type'])

        df = pd.DataFrame(cols)
        df['sandbox_id'] = df['sandbox_id'].astype('int')
        df['timestamp'] = df['timestamp'].apply(pd.to_datetime)
        df['cmd_type'] = df['cmd_type'].astype('category')
        df['cmd_raw'] = df['cmd_raw'].astype('string')
        return df

    def transform_events_to_df(self, events):
        """Convert raw event hits to a DataFrame with columns
        sandbox_id (int), timestamp (datetime), event_type (category), level (int)."""
        cols = {'sandbox_id': [], 'timestamp': [], 'event_type': [], 'level': []}
        for event in events:
            cols['sandbox_id'].append(event['_source']['sandbox_id'])
            cols['timestamp'].append(event['_source']['syslog']['@timestamp'])
            # Keep only the last dotted component of the fully-qualified type.
            cols['event_type'].append(event['_source']['type'].split('.')[-1])
            cols['level'].append(event['_source']['level'])

        df = pd.DataFrame(cols)
        df['sandbox_id'] = df['sandbox_id'].astype('int')
        df['timestamp'] = df['timestamp'].apply(pd.to_datetime)
        df['event_type'] = df['event_type'].astype('category')
        df['level'] = df['level'].astype('int')
        return df

    def resolve_elastic_error(self, error):
        """Map known Elasticsearch error types to user-friendly messages;
        unknown types are returned unchanged."""
        error_switch = {
            'index_not_found_exception': 'Sandbox given on input does not exist'
        }
        return error_switch.get(error, error)
+112 −10
Original line number Diff line number Diff line
%% Cell type:markdown id:175dd1a2 tags:

# KYPO Bash Logs Analysis scripts

These scripts help to get fast knowledge about data from KYPO Training or clean them up.

Each script contains its definition with example output, the cell with code, and following cell the example usage. Almost every script offers two functions. One function with analysis implementation usually returns result in Pandas `DataFrame` or some other data structure. The next function with the `show_` prefix sends formatted output to the console.


#### Prerequisite
Scripts are dependent on module kypo_elk_scripts_utils and elasticsearch_connection. Be sure these modules are in the same folder, and the notebook is not used independently.

%% Cell type:code id:eda9570c tags:

``` python
import pandas as pd
import numpy as np
from elasticsearch_connection import ElasticSearchConnection
from kypo_data_preprocessor import KYPODataPreprocessor
from kypo_elk_scripts_utils import parse_cmd, format_delta_time, filter_logs_in_level_range
import re
```

%% Cell type:markdown id:c643116e tags:

### Setup
Here you should set up your data features.

%% Cell type:code id:931ae3f1 tags:

``` python
esc = ElasticSearchConnection('http://localhost', '9200')
esc.set_sandbox_id(['130', '131', '128', '129'])
esc = KYPODataPreprocessor('http://localhost', '9200')
esc.set_sandbox_ids(['130', '131', '128', '129'])
esc.set_time_from('2020-04-05T17:20:00')
esc.set_time_to('2020-06-07T17:40:00')
```

%% Cell type:markdown id:a7bb5249 tags:

### Data test

If the data are unclear and you are unsure which Sandbox IDs are included, use these methods (`sniff_events()`, `sniff_bash_logs()`) to explore basic information about data in Elasticsearch.
If the data are unclear and you are unsure which Sandbox IDs are included, use these methods (`explore_events()`, `explore_bash_logs()`) to explore basic information about data in Elasticsearch.

%% Cell type:code id:c5c2c930 tags:

``` python
esc.sniff_events()
esc.explore_events()
```

%% Output

                                      timestamp                                   \
                                           amin                             amax
    sandbox_id
    2          2020-05-06 13:25:14.192000+00:00 2020-05-07 16:11:18.210000+00:00
    14         2021-10-06 12:51:10.361000+00:00 2021-10-06 14:09:56.963000+00:00
    15         2021-10-08 07:55:11.913000+00:00 2021-10-08 10:12:30.217000+00:00
    16         2021-10-08 09:26:00.047000+00:00 2021-10-13 17:10:01.247000+00:00
    17         2021-10-08 14:24:46.476000+00:00 2021-10-08 15:09:42.204000+00:00
    ...                                     ...                              ...
    220        2020-05-08 17:29:01.279000+00:00 2020-05-10 08:16:12.641000+00:00
    239        2020-05-11 10:12:18.504000+00:00 2020-05-11 12:39:47.420000+00:00
    340        2020-05-14 08:04:40.413000+00:00 2020-05-14 10:00:33.557000+00:00
    341        2020-05-14 08:05:20.637000+00:00 2020-05-14 09:39:30.578000+00:00
    342        2020-05-14 08:28:57.394000+00:00 2020-05-14 09:45:12.721000+00:00
    
               event_type
                    count
    sandbox_id
    2                 128
    14                 43
    15                 39
    16                 21
    17                 23
    ...               ...
    220                44
    239                54
    340                33
    341                46
    342                35
    
    [71 rows x 3 columns]

%% Cell type:code id:a819f247 tags:

``` python
esc.sniff_bash_logs()
esc.explore_bash_logs()
```

%% Output

                                      timestamp                                    \
                                           amin                              amax
    sandbox_id
    128        2020-05-06 16:20:12.098331+02:00  2020-05-06 17:37:03.137332+02:00
    129        2020-05-06 16:40:34.978581+02:00  2020-05-06 17:20:25.268125+02:00
    130        2020-05-06 16:26:28.378340+02:00  2020-05-06 17:05:35.286306+02:00
    131        2020-05-06 16:29:29.777667+02:00  2020-05-06 17:30:12.723408+02:00
    172        2020-05-06 19:01:41.358029+02:00  2020-05-07 14:31:01.169936+02:00
    185               2019-12-12 10:11:09+02:00         2019-12-12 11:28:13+02:00
    187               2019-12-12 10:17:36+02:00         2019-12-12 11:19:15+02:00
    188               2019-12-12 10:41:21+02:00         2019-12-12 11:56:51+02:00
    195               2019-12-12 11:38:03+02:00         2019-12-12 11:47:22+02:00
    340        2020-05-14 10:18:23.449882+02:00  2020-05-14 09:58:22.781889+00:00
    341        2020-05-14 10:20:38.723241+02:00  2020-05-14 11:35:39.423294+02:00
    342        2020-05-14 10:34:24.500753+02:00  2020-05-14 11:42:59.800874+02:00
    
               cmd_raw
                 count
    sandbox_id
    128             36
    129              9
    130             10
    131             24
    172            145
    185             56
    187             59
    188             23
    195              6
    340             67
    341             15
    342             70

%% Cell type:markdown id:e84e39d5 tags:

### 2. Show used tools
**Definition**

For a given subset of command logs, show all the tools that are used in the logs (optionally with the counts of how often they were used).

**Explanation**

The function `extract_used_tools` returns a DataFrame with an extra column `cmd`, which contains the program name parsed from the `cmd_raw` column. The following function `show_used_tools` prints the count of each command to the console.

%% Cell type:code id:e5047046 tags:

``` python
def extract_used_tools(logs_df):
    """Add a 'cmd' column holding the program name parsed out of each raw
    command line; returns the (mutated) DataFrame."""
    logs_df['cmd'] = logs_df['cmd_raw'].apply(lambda raw_cmd: parse_cmd(raw_cmd)[0])
    return logs_df

def show_used_tools(logs_df):
    """Print how many times each tool occurs in the logs."""
    tool_counts = extract_used_tools(logs_df)['cmd'].value_counts()
    print(tool_counts)
```

%% Cell type:code id:fb073f9a tags:

``` python
# Print to console
logs_df = esc.fetch_bash_logs()
show_used_tools(logs_df)

# Create chart from updated data frame
logs_df = extract_used_tools(logs_df)
# logs_df['cmd'].value_counts().plot.bar(rot=0)
```

%% Output

    [!] Request problem in bash logs - Sandbox given on input does not exist
    Series([], Name: cmd, dtype: Int64)
    nmap                20
    python              20
    ls                   7
    python3              5
    mv                   3
    cd                   3
    man                  3
    for                  2
    show                 2
    msf                  2
    msfconsole           2
    mfsconsole           2
    curl                 1
    map                  1
    top                  1
    ~htop                1
    metasploit           1
    grep                 1
    script.py            1
    shooooooooooooow     1
    Name: cmd, dtype: int64

%% Cell type:markdown id:58848fda tags:

### 3. Show all combinations of arguments

**Definition**

For a given tool, show all the combinations (n-tuples) of its arguments that appear in the logs. Note that some arguments are standalone (e.g., `--help`), and some require an additional parameter (e.g., `-p 20`).

**Explanation**

The function `combinations_of_args` searches for unique command parameter combinations and returns them as an array of strings. The `show_combinations_of_args` function prints formatted output to the console.

%% Cell type:code id:8db63fab tags:

``` python
def combinations_of_args(logs_df, exp_cmd):
    """Collect the distinct argument combinations used with *exp_cmd*.

    Returns a list of argument lists, one per unique combination found in
    the 'cmd_raw' column.
    """
    unique_invocations = set()
    for raw_cmd in logs_df['cmd_raw']:
        parsed = parse_cmd(raw_cmd)
        if parsed[0] == exp_cmd:
            unique_invocations.add(' '.join([parsed[0]] + parsed[1]))
    return [parse_cmd(invocation)[1] for invocation in unique_invocations]


def show_combinations_of_args(logs_df, exp_cmd):
    """Print each argument combination found for *exp_cmd*."""
    for arg_combo in combinations_of_args(logs_df, exp_cmd):
        print(exp_cmd + '  -  ' + ' '.join(arg_combo))

```

%% Cell type:code id:402d9398 tags:

``` python
# Print to console
logs_df = esc.fetch_bash_logs()
show_combinations_of_args(logs_df, 'python')

# Get output values
print(combinations_of_args(logs_df, 'nmap'))
```

%% Output

    [!] Request problem in bash logs - Sandbox given on input does not exist
    []
    python  -  172.18.1.5:10000 id script.py
    python  -  "ls" http://172.18.1.5:10000 script.py
    python  -  "cat twofactor_form.cgi" http://172.18.1.5:10000 script.py
    python  -  172.18.1.5:10000/session_login.cgi ls script.py
    python  -  "cat twofactor.pl" http://172.18.1.5:10000 script.py
    python  -  CVE_2019_15107.py https://10.10.20.166:10000
    python  -  http://172.18.1.5:10000 ls script.py
    python  -  https://172.18.1.5:10000 ls script.py
    python  -  "cat CHANGELOG" http://172.18.1.5:10000 script.py
    python  -  CVE_2019_15107.py https://10.10.20.166:10000 ls
    python  -  /root;ls cd http://172.18.1.5:10000 script.py
    python  -  "cat *" > http://172.18.1.5:10000 out.txt script.py
    python  -  172.18.1.5:10000 ls script.py
    python  -  "cd /root;ls" http://172.18.1.5:10000 script.py
    python  -  "twofactor_form.cgi" http://172.18.1.5:10000 script.py
    python  -  "cat *" http://172.18.1.5:10000 script.py
    [['-p 172.18.1.5'], ['-p 10000', '172.18.1.5'], ['-n 172.18.1.0/24', 'sP'], ['172.18.1.5'], ['-p 80', '172.18.1.5'], [], ['-F 172.18.1.0/24', '-n ', 'sP'], ['-n 172.18.1.5', 'sP'], ['-n 172.18.1.0', 'sP'], ['-p 10000', '-sV 172.18.1.5'], ['-p 20000', '172.18.1.5'], ['-F 172.18.1.5'], ['-sV 172.18.1.5'], ['-65535 172.18.1.0/24', 'p0'], ['172.18.1.5', 'p']]

%% Cell type:markdown id:5989531f tags:

### 4: Equivalence classes of commands

**Definition**

For a given subset of command logs, show equivalence classes of commands (considering the order of arguments)

**Explanation**

The function `eq_classes_of_commands` normalizes the command parameters and returns the commands with equivalent meaning. The following function `show_eq_classes_of_commands` prints the result to the console.

%% Cell type:code id:f4aa2670 tags:

``` python
def eq_classes_of_commands(logs_df):
    """Group command logs into equivalence classes.

    Commands whose program name and normalized parameters match are
    equivalent.  Returns a list of lists of raw command strings, keeping
    only classes containing more than one distinct raw form.
    """
    logs_df['cmd'] = logs_df['cmd_raw'].apply(lambda raw: parse_cmd(raw)[0])
    logs_df['norm_params'] = logs_df['cmd_raw'].apply(lambda raw: ' '.join(parse_cmd(raw)[1]))
    unique_cmds = logs_df.drop_duplicates(subset=['cmd', 'norm_params'])
    eq_cmds = []
    for _, unique_row in unique_cmds.iterrows():
        same_class = (logs_df['cmd'] == unique_row['cmd']) & (logs_df['norm_params'] == unique_row['norm_params'])
        eq_rows = logs_df.loc[same_class]
        if eq_rows.drop_duplicates(subset=['cmd_raw'])['cmd'].count() > 1:
            eq_cmds.append([member['cmd_raw'] for _, member in eq_rows.iterrows()])
    return eq_cmds

def show_eq_classes_of_commands(logs_df):
    """Print each equivalence class as its raw commands joined by ' == '."""
    for eq_class in eq_classes_of_commands(logs_df):
        print(' == '.join(eq_class))
```

%% Cell type:code id:10459ac8 tags:

``` python
logs_df = esc.fetch_bash_logs()
show_eq_classes_of_commands(logs_df)
```

%% Output

    nmap -p 172.18.1.5 == nmap -p  172.18.1.5
    nmap -p-  172.18.1.5 == nmap -p-  172.18.1.5 == nmap -p- 172.18.1.5

%% Cell type:markdown id:59da28c0 tags:

### 5. Disregard the sudo

**Definition**

For commands with "sudo", disregard the "sudo" key-word.

**Explanation**

The function `remove_sudo_from_df` removes the `sudo` prefix in `cmd_raw` column and return `DataFrame`.

%% Cell type:code id:f5fa1a27 tags:

``` python
def remove_sudo(cmd):
    """Return *cmd* with a leading 'sudo' token removed.

    Whitespace is normalized to single spaces either way (the command is
    re-joined from its tokens).  Empty or whitespace-only input yields ''.
    """
    tokens = cmd.split()
    if not tokens:
        # Guard: the original indexed tokens[0] directly and raised
        # IndexError on an empty or whitespace-only command.
        return ''
    if tokens[0] == 'sudo':
        tokens = tokens[1:]
    return ' '.join(tokens)

def remove_sudo_from_df(logs_df):
    """Strip the 'sudo' prefix from every entry of the 'cmd_raw' column and
    return the (mutated) DataFrame."""
    logs_df['cmd_raw'] = logs_df['cmd_raw'].apply(remove_sudo)
    return logs_df
```

%% Cell type:code id:84a529ce tags:

``` python
logs_df = esc.fetch_bash_logs()
remove_sudo_from_df(logs_df)
logs_df
```

%% Output

        sandbox_id                        timestamp  \
    0          128 2020-05-06 16:20:12.098331+02:00
    1          128 2020-05-06 16:35:30.999179+02:00
    2          128 2020-05-06 16:37:35.317756+02:00
    3          128 2020-05-06 16:20:39.407920+02:00
    4          128 2020-05-06 16:24:25.830504+02:00
    ..         ...                              ...
    74         131 2020-05-06 17:22:11.958857+02:00
    75         131 2020-05-06 17:24:08.072683+02:00
    76         131 2020-05-06 17:26:17.803323+02:00
    77         131 2020-05-06 17:26:24.386783+02:00
    78         131 2020-05-06 17:30:12.723408+02:00
    
                                                  cmd_raw      cmd_type
    0                               nmap -p 80 172.18.1.5  bash-command
    1                                  nmap -p 172.18.1.5  bash-command
    2                                     nmap 172.18.1.5  bash-command
    3                            nmap -p 20000 172.18.1.5  bash-command
    4   for i in {1..65535}; do nmap -p ${i} 172.18.1....  bash-command
    ..                                                ...           ...
    74                                      show exploits  bash-command
    75                shooooooooooooow exploiiiiiiiiiiii2  bash-command
    76                                      show exploits  bash-command
    77                                         msfconsole  bash-command
    78                                                 ls  bash-command
    
    [79 rows x 4 columns]

%% Cell type:markdown id:0c787020 tags:

### 6. Show the total duration of the actions
**Definition**

For a given subset of commands, show the total duration of the actions.

**Explanation**

The function `actions_total_duration_for_sandbox` looks up the minimal and maximal time value for each sandbox and returns them as a `DataFrame`. The following `show_actions_total_duration_for_sandbox` prints the output to the console.

%% Cell type:code id:56a0269a tags:

``` python
def actions_total_duration_for_sandbox(logs_df):
    """Aggregate logs per sandbox into the earliest and latest timestamp."""
    return logs_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max]})

def show_actions_total_duration_for_sandbox(logs_df):
    """Print, for each sandbox, the first/last log time and the total span."""
    for _, row in actions_total_duration_for_sandbox(logs_df).iterrows():
        start, end = row.timestamp.amin, row.timestamp.amax
        print('Sandbox {} contains logs from {} to {} ({})'.format(
            row.name,
            start.strftime('%H:%M:%S %d-%m-%Y'),
            str(end.strftime('%H:%M:%S %d-%m-%Y')),
            format_delta_time(end - start)))
```

%% Cell type:code id:f2a750c7 tags:

``` python
# Aggregated DF
logs_df = esc.fetch_bash_logs()
print(actions_total_duration_for_sandbox(logs_df))

# Print to console
logs_df = esc.fetch_bash_logs()
show_actions_total_duration_for_sandbox(logs_df)
```

%% Output

                                      timestamp
                                           amin                             amax
    sandbox_id
    128        2020-05-06 16:20:12.098331+02:00 2020-05-06 17:37:03.137332+02:00
    129        2020-05-06 16:40:34.978581+02:00 2020-05-06 17:20:25.268125+02:00
    130        2020-05-06 16:26:28.378340+02:00 2020-05-06 17:05:35.286306+02:00
    131        2020-05-06 16:29:29.777667+02:00 2020-05-06 17:30:12.723408+02:00
    Sandbox 128 contains logs from 16:20:12 06-05-2020 to 17:37:03 06-05-2020 (1:16:51)
    Sandbox 129 contains logs from 16:40:34 06-05-2020 to 17:20:25 06-05-2020 (0:39:50)
    Sandbox 130 contains logs from 16:26:28 06-05-2020 to 17:05:35 06-05-2020 (0:39:06)
    Sandbox 131 contains logs from 16:29:29 06-05-2020 to 17:30:12 06-05-2020 (1:00:42)

%% Cell type:markdown id:818c7ef5 tags:

### 7. The duration between two actions

**Definition**

Find command logs for which the total duration is lower/higher than a given number.

**Explanation**

The function `actions_total_duration_for_sandbox_to_number` finds the sandboxes, where the difference between the first and last command log is lower or higher than the number in seconds given in parameters. Note that the last parameter must be `"lower"` or `"higher"`. The function `show_actions_total_duration_for_sandbox_to_number` prints result to the console.

%% Cell type:code id:d2973578 tags:

``` python
def actions_total_duration_for_sandbox_to_number(logs_df, time_up, state):
    """Select sandboxes whose total log time span is at most ('lower') or at
    least ('higher') *time_up* seconds; returns the aggregated DataFrame."""
    threshold = pd.Timedelta('{} seconds'.format(time_up))
    summary = logs_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max]})
    summary['delta'] = (summary.timestamp.amax - summary.timestamp.amin).fillna(pd.Timedelta('0 seconds'))
    if state == 'lower':
        keep = summary['delta'] <= threshold
    else:
        keep = summary['delta'] >= threshold
    return summary.loc[keep]

def show_actions_total_duration_for_sandbox_to_number(logs_df, time_up, state):
    """Print the time span of every sandbox selected above."""
    for _, row in actions_total_duration_for_sandbox_to_number(logs_df, time_up, state).iterrows():
        start, end = row.timestamp.amin, row.timestamp.amax
        print('Sandbox {} contains logs from {} to {} ({})'.format(
            row.name,
            start.strftime('%H:%M:%S %d-%m-%Y'),
            str(end.strftime('%H:%M:%S %d-%m-%Y')),
            format_delta_time(end - start)))
```

%% Cell type:code id:3e681062 tags:

``` python
# Aggregated DF
logs_df = esc.fetch_bash_logs()
actions_total_duration_for_sandbox_to_number(logs_df, 50*60, 'higher')

# Print to console
logs_df = esc.fetch_bash_logs()
show_actions_total_duration_for_sandbox_to_number(logs_df, 50*60, 'lower')
```

%% Output

    Sandbox 129 contains logs from 16:40:34 06-05-2020 to 17:20:25 06-05-2020 (0:39:50)
    Sandbox 130 contains logs from 16:26:28 06-05-2020 to 17:05:35 06-05-2020 (0:39:06)

%% Cell type:markdown id:2abf232e tags:

### 8. Duration of the actions to the given number

**Definition**

For a given subset of commands, show the time differences between two successive actions.

**Example**

The function `command_time_duration` extends the `DataFrame` with one column, the time difference between the cmd and cmd before.

%% Cell type:code id:f870d655 tags:

``` python
def command_time_duration(logs_df):
    """Sort logs by time and add a 'delta' column: the formatted time gap
    since the previous command (the first row gets a zero gap)."""
    ordered = logs_df.sort_values(by='timestamp')
    gaps = (ordered['timestamp'] - ordered['timestamp'].shift()).fillna(pd.Timedelta('0 seconds'))
    ordered['delta'] = gaps.apply(lambda gap: format_delta_time(gap))
    return ordered
```

%% Cell type:code id:18ef239b tags:

``` python
logs_df = esc.fetch_bash_logs()
command_time_duration(logs_df)
```

%% Output

        sandbox_id                        timestamp  \
    0          128 2020-05-06 16:20:12.098331+02:00
    3          128 2020-05-06 16:20:39.407920+02:00
    4          128 2020-05-06 16:24:25.830504+02:00
    45         130 2020-05-06 16:26:28.378340+02:00
    46         130 2020-05-06 16:28:30.551719+02:00
    ..         ...                              ...
    78         131 2020-05-06 17:30:12.723408+02:00
    32         128 2020-05-06 17:35:19.914772+02:00
    33         128 2020-05-06 17:36:01.954768+02:00
    34         128 2020-05-06 17:36:34.386631+02:00
    35         128 2020-05-06 17:37:03.137332+02:00
    
                                                  cmd_raw      cmd_type    delta
    0                               nmap -p 80 172.18.1.5  bash-command  0:00:00
    3                            nmap -p 20000 172.18.1.5  bash-command  0:00:27
    4   for i in {1..65535}; do nmap -p ${i} 172.18.1....  bash-command  0:03:46
    45                        sudo nmap -sP -n 172.18.1.0  bash-command  0:02:02
    46                     sudo nmap -sP -n 172.18.1.0/24  bash-command  0:02:02
    ..                                                ...           ...      ...
    78                                                 ls  bash-command  0:00:46
    32                                python3 myscript.py  bash-command  0:05:07
    33                                python3 myscript.py  bash-command  0:00:42
    34                                python3 myscript.py  bash-command  0:00:32
    35                                python3 myscript.py  bash-command  0:00:28
    
    [79 rows x 5 columns]

%% Cell type:markdown id:7cb9072e tags:

### 9. Two actions time difference to the given number

**Definition**

Find command logs for which there exists a time difference between two successive actions that is lower/higher than a given number.

**Explanation**

The function `command_time_duration_to_number` filters the commands if the time difference between two commands is lower or higher than the given number. The `show_command_time_duration_to_number` prints these commands to console. The third parameter in functions should be `"lower"` or `"higher"` depending on desired output data.

%% Cell type:code id:1762e53f tags:

``` python
def command_time_duration_to_number(logs_df, time_up, state):
    """Keep only commands whose gap to the previous command is at most
    ('lower') or at least ('higher') *time_up* seconds.

    Adds a 'cmd_before' column (previous raw command, '---' for the first
    row) and a formatted 'delta' column.
    """
    threshold = pd.Timedelta('{} seconds'.format(time_up))
    ordered = logs_df.sort_values(by='timestamp')
    ordered['cmd_before'] = ordered['cmd_raw'].shift().fillna('---')
    ordered['delta'] = (ordered['timestamp'] - ordered['timestamp'].shift()).fillna(pd.Timedelta('0 seconds'))
    if state == 'lower':
        ordered = ordered.loc[ordered['delta'] <= threshold]
    else:
        ordered = ordered.loc[ordered['delta'] >= threshold]
    ordered['delta'] = ordered['delta'].apply(lambda gap: format_delta_time(gap))
    return ordered

def show_command_time_duration_to_number(logs_df, time_up, state):
    """Print each selected command with its gap and the preceding command."""
    selected = command_time_duration_to_number(logs_df, time_up, state)
    for _, row in selected.iterrows():
        print('{} - {:60} --> {:40}'.format(str(row.delta), row.cmd_before, row.cmd_raw))
```

%% Cell type:code id:b1626390 tags:

``` python
logs_df = esc.fetch_bash_logs()
command_time_duration_to_number(logs_df, 20, 'lower')

logs_df = esc.fetch_bash_logs()
show_command_time_duration_to_number(logs_df, 20, 'higher')
```

%% Output

    0:00:27 - nmap -p 80 172.18.1.5                                        --> nmap -p 20000 172.18.1.5
    0:03:46 - nmap -p 20000 172.18.1.5                                     --> for i in {1..65535}; do nmap -p ${i} 172.18.1.5 >> out.txt; done
    0:02:02 - for i in {1..65535}; do nmap -p ${i} 172.18.1.5 >> out.txt; done --> sudo nmap -sP -n 172.18.1.0
    0:02:02 - sudo nmap -sP -n 172.18.1.0                                  --> sudo nmap -sP -n 172.18.1.0/24
    0:00:59 - sudo nmap -sP -n 172.18.1.0/24                               --> map -p  172.18.1.5
    0:00:36 - nmap -p  172.18.1.5                                          --> for i in {65535..1}; do nmap -p ${i} 172.18.1.5 >> out.txt; done
    0:00:32 - sudo nmap -sP -n -F 172.18.1.0/24                            --> sudo nmap -sP -n 172.18.1.5
    0:01:36 - nmap -p- 172.18.1.5                                          --> nmap -p0-65535 172.18.1.0/24
    0:01:48 - nmap -p0-65535 172.18.1.0/24                                 --> nmap -F 172.18.1.5
    0:00:51 - nmap -F 172.18.1.5                                           --> nmap -p 172.18.1.5
    0:02:04 - nmap -p 172.18.1.5                                           --> nmap 172.18.1.5
    0:01:18 - nmap 172.18.1.5                                              --> nmap 172.18.1.5
    0:01:41 - nmap 172.18.1.5                                              --> nmap
    0:02:27 - nmap                                                         --> nmap -sV 172.18.1.5
    0:01:14 - nmap -sV 172.18.1.5                                          --> nmap -p 10000 172.18.1.5
    0:00:50 - man nmap                                                     --> nmap -p-  172.18.1.5
    0:00:22 - nmap -p-  172.18.1.5                                         --> nmap -sV -p 10000 172.18.1.5
    0:02:38 - nmap -sV -p 10000 172.18.1.5                                 --> ~htop
    0:00:26 - ls                                                           --> cd /root
    0:00:32 - cd /root                                                     --> ls Documents/
    0:02:29 - ls Downloads/                                                --> man nmap
    0:02:30 - man nmap                                                     --> python CVE_2019_15107.py https://10.10.20.166:10000 ls
    0:00:44 - python CVE_2019_15107.py https://10.10.20.166:10000 ls       --> python CVE_2019_15107.py https://10.10.20.166:10000
    0:01:29 - python CVE_2019_15107.py https://10.10.20.166:10000          --> man metasploit
    0:00:55 - nmap -p-  172.18.1.5                                         --> metasploit
    0:03:54 - metasploit                                                   --> mv script script.py
    0:03:05 - mv script script.py                                          --> curl -L https://10.10.20.166:10000/password_change.cgi
    0:01:23 - curl -L https://10.10.20.166:10000/password_change.cgi       --> script.py 172.18.1.5:10000 ls
    0:01:59 - python script.py 172.18.1.5:10000 ls                         --> python script.py 172.18.1.5:10000/session_login.cgi ls
    0:00:22 - python script.py 172.18.1.5:10000/session_login.cgi ls       --> python script.py 172.18.1.5:10000 ls
    0:01:17 - python script.py 172.18.1.5:10000 ls                         --> msf
    0:01:42 - msf                                                          --> python script.py 172.18.1.5:10000 id
    0:00:44 - python script.py 172.18.1.5:10000 ls                         --> python script.py https://172.18.1.5:10000 ls
    0:01:36 - python script.py http://172.18.1.5:10000 ls                  --> python script.py http://172.18.1.5:10000 cd /root;ls
    0:00:52 - python script.py http://172.18.1.5:10000 cd /root;ls         --> python script.py http://172.18.1.5:10000 "cd /root;ls"
    0:00:21 - python script.py http://172.18.1.5:10000 "cd /root;ls"       --> msf
    0:00:31 - msf                                                          --> python script.py http://172.18.1.5:10000 "cat *"
    0:01:09 - python script.py http://172.18.1.5:10000 "cat *"             --> python script.py http://172.18.1.5:10000 "ls"
    0:01:01 - python script.py http://172.18.1.5:10000 "ls"                --> msfconsole
    0:00:20 - msfconsole                                                   --> python script.py http://172.18.1.5:10000 "cat CHANGELOG"
    0:00:22 - python script.py http://172.18.1.5:10000 "cat CHANGELOG"     --> python script.py http://172.18.1.5:10000 "ls"
    0:00:39 - python script.py http://172.18.1.5:10000 "ls"                --> python script.py http://172.18.1.5:10000 "twofactor_form.cgi"
    0:00:27 - show exploits                                                --> python script.py http://172.18.1.5:10000 "cat *" > out.txt
    0:01:29 - python script.py http://172.18.1.5:10000 "cat *" > out.txt   --> shooooooooooooow exploiiiiiiiiiiii2
    0:02:03 - shooooooooooooow exploiiiiiiiiiiii2                          --> grep "\d{5}" out.txt
    0:02:52 - msfconsole                                                   --> mv myscript.py
    0:00:46 - mv myscript myscript.py                                      --> ls
    0:05:07 - ls                                                           --> python3 myscript.py
    0:00:42 - python3 myscript.py                                          --> python3 myscript.py
    0:00:32 - python3 myscript.py                                          --> python3 myscript.py
    0:00:28 - python3 myscript.py                                          --> python3 myscript.py

%% Cell type:markdown id:e4218a64 tags:

### 10. Queries for command histories

**Definition**

For a given subset of command logs, compute simpledescriptive statistics (sum, min, max, median, avg, stddev).

**Explanation**

The function `bash_log_statistic` gets parameters as an instance of `ElasticSearchConnection` and two level ids - level_from and level_to. It returns statistics across all found sandboxes between these levels (the levels themselves included). The function returns the dictionary with keys - `min`, `max`, `mean`, `median` and `std`. Note that `min` and `max` contain tuples with sandbox_id. The function `show_bash_log_statistic` prints the result to the console.

%% Cell type:code id:95af63e3 tags:

``` python
def bash_log_statistic(elk_connection, level_from, level_to):
    """Descriptive statistics of per-sandbox command counts between two
    training levels (both levels included).

    Returns a dict with keys 'min', 'max' (tuples of (sandbox_id, count)),
    'mean', 'median' and 'std'.
    """
    events_df = elk_connection.fetch_events()
    logs_df = elk_connection.fetch_bash_logs()
    in_range = filter_logs_in_level_range(logs_df, events_df, level_from, level_to)
    counts = in_range.groupby(['sandbox_id'])['cmd_raw'].count().reset_index()
    min_row = counts[counts['cmd_raw'] == counts['cmd_raw'].min()].reset_index()
    max_row = counts[counts['cmd_raw'] == counts['cmd_raw'].max()].reset_index()
    return {
        'min': (min_row.loc[0]['sandbox_id'], min_row.loc[0]['cmd_raw']),
        'max': (max_row.loc[0]['sandbox_id'], max_row.loc[0]['cmd_raw']),
        'mean': counts['cmd_raw'].mean(),
        'median': counts['cmd_raw'].median(),
        'std': counts['cmd_raw'].std(),
    }

def show_bash_log_statistic(elk_connection, level_from, level_to):
    """Pretty-print the statistics computed by bash_log_statistic."""
    stats = bash_log_statistic(elk_connection, level_from, level_to)
    print('STATISTICS FOR BASH LOGS \n')
    print('  {:10} {} cmds (sandbox {})'.format('MIN', stats['min'][1], stats['min'][0]))
    print('  {:10} {} cmds (sandbox {})'.format('MAX', stats['max'][1], stats['max'][0]))
    print('  {:10} {} cmds'.format('MEAN', stats['mean']))
    print('  {:10} {} cmds'.format('MEDIAN', stats['median']))
    print('  {:10} {:.2f} cmds'.format('STD', stats['std']))
```

%% Cell type:code id:77881922 tags:

``` python
print(bash_log_statistic(esc, 61, 62))
show_bash_log_statistic(esc, 61, 62)
```

%% Output

    {'min': (129, 6), 'max': (131, 17), 'mean': 9.5, 'median': 7.5, 'std': 5.066228051190222}
    STATISTICS FOR BASH LOGS
    
      MIN        6 cmds (sandbox 129)
      MAX        17 cmds (sandbox 131)
      MEAN       9.5 cmds
      MEDIAN     7.5 cmds
      STD        5.07 cmds
+226 −0

File added.

Preview size limit exceeded, changes collapsed.