Current File: /usr/src/file_protector-1.1-1507/transport/transport.c
/**
   @file
   @brief Message transport between kernel and userspace
   @details Copyright (c) 2017-2021 Acronis International GmbH
   @author Mikhail Krivtsov (mikhail.krivtsov@acronis.com)
   @since $Id: $
*/

#include "transport.h"

#include "compat.h"
#include "debug.h"
#include "device.h"
#include "memory.h"
#include "message.h"
#include "ring.h"
#include "set.h"
#include "syscall_common.h"
#include "task_info_map.h"
#include "tracepoints.h"

#include <asm/io.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/jiffies.h>	// msecs_to_jiffies()
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>	// copy_from_user(), copy_to_user()
#include <linux/wait.h>		// wait_event*(), wake_up*()

#define TRANSPORT_MSG_SIZE_MAX (1<<10)
#define TRANSPORT_QUEUE_CAPACITY (0x1000 / sizeof(msg_t *))
#define TRANSPORT_WAIT_REPLY_TIMEOUT_MSECS (60*1000)
#define TRANSPORT_WAIT_RUNDOWN_TIMEOUT_MSECS (5*1000)

#define TRANSPORT_PRINTF(format, args...) DPRINTF(format, ##args)

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#define DATA_QUEUE_HEADER_SIZE sizeof(shared_data_queue_t)
#define DATA_QUEUE_ENTRY_HEADER_SIZE sizeof(data_queue_entry_t)

typedef struct {
	struct list_head transport_list_node;
	pid_t control_tgid;
	atomic64_t events_mask;
	// FIXME: Use 'msg_wait_queue.lock' instead of 'msg_spinlock'
	// #define msg_spinlock msg_wait_queue.lock
	spinlock_t msg_spinlock;
	wait_queue_head_t msg_wait_queue;
	bool shutdown;
	ring_t msg_ring;
	// sent messages waiting for 'reply'
	set_t sent_msgs_set;
	uint32_t queue_size;
	shared_data_queue_t *queue;
	atomic_t queue_event;
} transport_t;

typedef struct {
	struct mutex transport_mutex;
	unsigned transport_count;
	atomic64_t combined_events_mask;
	spinlock_t transport_spinlock;
	struct list_head transport_list;
	msg_id_t msg_id_sequence;
} transport_global_t;

static transport_global_t transport_global;

// Virtual memory allocators that are suitable for 'mmap'
static void *virt_mem_alloc(size_t size)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY;
	return (void *) __get_free_pages(gfp_flags, get_order(size));
}

static void virt_mem_free(void *ptr, size_t size)
{
	struct page *page;
	if (!ptr)
		return;
	page = virt_to_head_page(ptr);
	__free_pages(page, get_order(size));
}

static void transport_global_init(void)
{
	mutex_init(&transport_global.transport_mutex);
	transport_global.transport_count = 0;
	spin_lock_init(&transport_global.transport_spinlock);
	INIT_LIST_HEAD(&transport_global.transport_list);
	transport_global.msg_id_sequence = 0;
}

static void transport_global_register(transport_t *transport)
{
	spin_lock(&transport_global.transport_spinlock);
	list_add_tail(&transport->transport_list_node, &transport_global.transport_list);
	spin_unlock(&transport_global.transport_spinlock);
}

static void transport_global_unregister(transport_t *transport)
{
	spin_lock(&transport_global.transport_spinlock);
	if (!list_empty(&transport->transport_list_node)) {
		list_del_init(&transport->transport_list_node);
	}
	spin_unlock(&transport_global.transport_spinlock);
}

static bool transport_is_control_tgid_impl(pid_t tgid)
{
	transport_t *transport;
	bool is_control_tgid = false;
	list_for_each_entry(transport, &transport_global.transport_list, transport_list_node) {
		if (tgid == transport->control_tgid) {
			is_control_tgid = true;
			break;
		}
	}
	return is_control_tgid;
}

static bool transport_is_control_tgid(pid_t tgid)
{
	bool is_control_tgid;
	spin_lock(&transport_global.transport_spinlock);
is_control_tgid = transport_is_control_tgid_impl(tgid); spin_unlock(&transport_global.transport_spinlock); return is_control_tgid; } static msg_id_t transport_global_sequence_next_impl(void) { msg_id_t msg_id = ++transport_global.msg_id_sequence; if (!msg_id) { msg_id = ++transport_global.msg_id_sequence; } DPRINTF("msg_id=%llX", msg_id); return msg_id; } static msg_id_t transport_global_sequence_next(void) { // TODO DK: use 'atomic64' instead of spinlock msg_id_t msg_id; spin_lock(&transport_global.transport_spinlock); msg_id = transport_global_sequence_next_impl(); spin_unlock(&transport_global.transport_spinlock); return msg_id; } static void transport_global_recalculate_combined_event_mask_impl(void) { transport_t *transport; uint64_t combined_mask = 0; list_for_each_entry(transport, &transport_global.transport_list, transport_list_node) { barrier(); combined_mask |= atomic64_read(&transport->events_mask); barrier(); } barrier(); atomic64_set(&transport_global.combined_events_mask, combined_mask); } static void transport_global_recalculate_combined_event_mask(void) { spin_lock(&transport_global.transport_spinlock); transport_global_recalculate_combined_event_mask_impl(); spin_unlock(&transport_global.transport_spinlock); } uint64_t transport_global_get_combined_mask(void) { return atomic64_read(&transport_global.combined_events_mask); } static void drop_msgs_impl(ring_t *ring) { while (!ring_is_empty(ring)) { msg_t *msg = *(msg_t **) ring_consumer_ptr(ring); msg_unref(msg); ring_consumer_index_move_one(ring); } } /* 'msg ref/unref' for messages stored in 'sent_msgs_set' are invoked in 'msg_reply_wait_count inc/dec'. There is no need for separate 'msg ref/unref' calls. */ static void drop_sent_msgs_impl(set_t *set) { void *item_ptr = set_begin_ptr(set); void *end_ptr = set_end_ptr(set); while (item_ptr < end_ptr) { msg_t *msg = *(msg_t **) item_ptr; msg_reply_wait_count_dec(msg); item_ptr = set_ptr_next(set, item_ptr); } set->count = 0; } static void transport_shutdown(transport_t *transport) { DPRINTF("transport=%p", transport); spin_lock(&transport->msg_spinlock); { atomic64_set(&transport->events_mask, 0); transport->shutdown = true; // Discard undelivered messages drop_msgs_impl(&transport->msg_ring); // Discard messages waiting for 'reply' drop_sent_msgs_impl(&transport->sent_msgs_set); } spin_unlock(&transport->msg_spinlock); // wakeup all userspace 'read' waiters wake_up_all(&transport->msg_wait_queue); } // identify and shutdown transport failed to reply static void transport_shutdown_msg(transport_t *transport, msg_t *unreplied_msg) { bool found = false; DPRINTF("transport=%p unreplied_msg=%p", transport, unreplied_msg); spin_lock(&transport->msg_spinlock); { void *item_ptr = set_begin_ptr(&transport->sent_msgs_set); void *end_ptr = set_end_ptr(&transport->sent_msgs_set); while (item_ptr < end_ptr) { if (unreplied_msg == *(msg_t **) item_ptr) { found = true; break; } item_ptr = set_ptr_next(&transport->sent_msgs_set, item_ptr); } } spin_unlock(&transport->msg_spinlock); if (found) { WPRINTF("deativating transport on reply wait timeout"); transport_shutdown(transport); } } // identify and shutdown transport failed to reply static void transport_global_shutdown_msg(msg_t *unreplied_msg) { DPRINTF("unreplied_msg=%p", unreplied_msg); spin_lock(&transport_global.transport_spinlock); { transport_t *transport; transport_t *next_transport; list_for_each_entry_safe(transport, next_transport, &transport_global.transport_list, transport_list_node) { transport_shutdown_msg(transport, 
unreplied_msg); } } spin_unlock(&transport_global.transport_spinlock); } static void transport_free(transport_t *transport) { DPRINTF("transport=%p", transport); transport_global_unregister(transport); transport_shutdown(transport); IPRINTF("message queue items_count_max=%u capacity=%u", ring_items_count_max(&transport->msg_ring), ring_capacity(&transport->msg_ring)); IPRINTF("sent_msgs_set items_count_max=%u capacity=%u", set_items_count_max(&transport->sent_msgs_set), set_capacity(&transport->sent_msgs_set)); mem_free(ring_buffer(&transport->msg_ring)); mem_free(set_buffer(&transport->sent_msgs_set)); if (transport->queue) { virt_mem_free(transport->queue, transport->queue_size + DATA_QUEUE_HEADER_SIZE); } mem_free(transport); } static bool transport_ring_init(ring_t *ring) { size_t buffer_size = TRANSPORT_QUEUE_CAPACITY * sizeof(msg_t *); msg_t **msgs; bool success; if (!buffer_size) { msgs = NULL; success = true; } else { msgs = mem_alloc0(buffer_size); success = (bool) msgs; } ring_init(ring, msgs, buffer_size, sizeof(msg_t *)); return success; } static bool transport_set_init(set_t *set) { size_t buffer_size = TRANSPORT_QUEUE_CAPACITY * sizeof(msg_t *); msg_t **msgs; bool success; if (!buffer_size) { msgs = NULL; success = true; } else { msgs = mem_alloc0(buffer_size); success = (bool) msgs; } set_init(set, msgs, buffer_size, sizeof(msg_t *)); return success; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Shared with userspace Data Queue implementation // Some defines for compiler atomic hacks. We only care about x86_64 here, on other platforms those are NOT valid. // Other platforms must provider their implementations for 'smp_*' & 'READ/WRITE_ONCE' // Generally those convenience memory ordering functions are available in Linux 3.12 & Linux 3.18. // I assume it will not be necessary to support any other architectures other than x86_64 on kernels that old. 
#ifndef READ_ONCE
#define __READ_ONCE_SIZE \
({ \
	switch (size) { \
	case 1: *(__u8 *)res = *(volatile __u8 *)p; break; \
	case 2: *(__u16 *)res = *(volatile __u16 *)p; break; \
	case 4: *(__u32 *)res = *(volatile __u32 *)p; break; \
	case 8: *(__u64 *)res = *(volatile __u64 *)p; break; \
	default: \
		barrier(); \
		__builtin_memcpy((void *)res, (const void *)p, size); \
		barrier(); \
	} \
})

static void _do_read_once_size(const volatile void *p, void *res, int size)
{
	__READ_ONCE_SIZE;
}

#define READ_ONCE(x) \
({ \
	union { typeof(x) __val; char __c[1]; } __u; \
	_do_read_once_size(&(x), __u.__c, sizeof(x)); \
	smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \
	__u.__val; \
})
#endif

#ifndef WRITE_ONCE
static void _do_write_once_size(volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
	case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
	case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
	case 8: *(volatile __u64 *)p = *(__u64 *)res; break;
	default:
		barrier();
		__builtin_memcpy((void *)p, (const void *)res, size);
		barrier();
	}
}

#define WRITE_ONCE(x, val) \
({ \
	union { typeof(x) __val; char __c[1]; } __u = \
		{ .__val = (__force typeof(x)) (val) }; \
	_do_write_once_size(&(x), __u.__c, sizeof(x)); \
	__u.__val; \
})
#endif

#ifndef smp_store_release
#define smp_store_release(p, v) \
do { \
	barrier(); \
	WRITE_ONCE(*p, v); \
} while (0)
#endif

#ifndef smp_load_acquire
#define smp_load_acquire(p) \
({ \
	typeof(*p) ___p1 = READ_ONCE(*p); \
	barrier(); \
	___p1; \
})
#endif

#define DATA_QUEUE_ENTRY_AT(queue, v) (data_queue_entry_t*)((uint8_t *)queue->entries + v)

#ifdef ROUND_UP
#undef ROUND_UP
#endif
#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))

static inline void data_queue_write_new_entry(shared_data_queue_t *queue, uint32_t offset, void *data, uint32_t data_size)
{
	data_queue_entry_t *entry = DATA_QUEUE_ENTRY_AT(queue, offset);

	// Technically this 'WRITE_ONCE' is useless but let's follow the rules
	WRITE_ONCE(entry->size, data_size);
	// Not protecting the memcpy with 'barrier' call, we will use 'smp_*' later anyways
	memcpy(&entry->data, data, data_size);
}

// This function is called from data queue 'writer' under the spin_lock as it is NOT thread-safe
// As such, reads from 'queue->tail' can be done using 'READ_ONCE' (relaxed), writes must be done using 'smp_store_release'.
// 'reader' may decide to alter 'queue->head' so 'smp_load_acquire' must be used to read it, writes are NOT allowed.
static bool transport_shared_data_queue_enqueue_impl(transport_t *transport, void *data, uint32_t data_size)
{
	uint32_t head, tail, new_tail;
	// 'entry_size' is the size of the whole 'data_queue_entry' including the header
	// in the 'data_queue_entry' 'data_size' is written instead to avoid useless checks.
	uint32_t entry_size = data_size + DATA_QUEUE_ENTRY_HEADER_SIZE;
	shared_data_queue_t *queue = transport->queue;
	uint32_t queue_size = transport->queue_size;

	// Notice that we are not doing any memory shenanigans here to load tail & head.
	// The barriers will be done later if it will appear that they are necessary.
	// !!! 'head' might not be synchronized with 'reader', it is OK and will be handled in the end.
	tail = READ_ONCE(queue->tail);
	head = smp_load_acquire(&queue->head);

	// Check for unreasonable 'tail' or 'head', must never happen.
if (queue_size < tail || queue_size < head) { WPRINTF("Invalid tail/head detected: tail=%u, head=%u, size=%u" , (unsigned) tail, (unsigned) head, (unsigned) queue_size); return false; } // Start inserting the contents of 'data' in the shared data queue if (tail >= head) { // Tail is further than head, it is a regular scenario. Handle it // head tail // V V // -----|*************|----------------- // ^ ^ // data to be dequeued | // free space if ((tail + entry_size) <= queue_size) { // There is enough buffer in the 'tail' of the queue, write the entry and move the tail // head tail new_tail // V V V // -----|*************|+++++++|------- // ^ // new entry data_queue_write_new_entry(queue, tail /*off*/, data, data_size); new_tail = tail + entry_size; } else if (head > entry_size) { // As first condition did not satisfy, cannot put data after 'tail' // Have to loop back to the start and there is enough space before userspace 'head' // head tail // V V // |++++++|------------|*************|?? <- zapped entry w/ size>queue_size-tail, if fits // ^ ^ // off new_tail // Need to explain the userspace that current entry is too long to fit. // If there is not enough space to even place a entry header, do nothing. // Otherwise, deliberately zap the entry by putting 'data_size' that is too big. if ((queue_size - tail) >= DATA_QUEUE_ENTRY_HEADER_SIZE) { data_queue_entry_t *entry_to_zap = DATA_QUEUE_ENTRY_AT(queue, tail); entry_to_zap->size = data_size; // do not touch 'entry_to_zap->data', it is bogus. entry just says go to the start } // Write data at the beginning of the queue data_queue_write_new_entry(queue, 0 /*off*/, data, data_size); new_tail = /*off==0 + */ entry_size; } else { // There is neither enough space after 'tail' nor before 'head', bail WPRINTF("No more space is left"); return false; } } else { // Catching to the 'head' from the other size. // tail head // V V // ****|--------------|*************** // Insert can still be done if 'head' will not be overrun if ((head - tail) > entry_size) { // tail head // V V // ****|+++++|------|*************** // ^ // new_tail data_queue_write_new_entry(queue, tail, data, data_size); new_tail = tail + entry_size; } else { // There is not enough space not to overrun 'head', bail WPRINTF("No more space is left"); return false; } } // Expose all the content written in this thread as per 'release' semantics. // Reader must do 'smp_store_acquire' on the same variable ('tail') to see 'entries' written. // !!! This logic does NOT enforce 'tail' in 'reader' to be equal to 'tail' in 'writer' new_tail = ROUND_UP(new_tail, sizeof(uint32_t)); smp_store_release(&queue->tail, new_tail); // The new tail was published to the 'queue' but is it necessary to notify the 'reader'? // If in the beginning 'tail == head', it means that userspace has finished reading all the // content and is going to wait or is already waiting for the 'event'. // In such case it is clear that will must notify the 'reader' no matter what. // Moreover 'reader' cannot move the 'head' past the 'tail' so it is guaranteed that it is // indeed the latest published 'head' by the reader. if (tail != head) { // If 'tail != head', it is not as clear. If it so happened that userspace moved the 'head' // to be equal to 'tail' while 'writer' was adding the new entry, 'reader' will go 'wait'. // So we must refresh the 'head' to ensure we actually do not need to wakeup the 'reader'. // The other situation is also valid - 'writer' might delay writes to the head as 'atomic' ops. 
// We need to make sure userspace will continue consuming events as we wrote the 'tail'. // Whenever userspace will detect that its current 'tail==head', it will perform 'smb_mb' // to fetch the new 'tail' we just wrote to ensure it does not need to consume anymore. smp_mb(); head = READ_ONCE(queue->head); } if (tail == head) { // atomic_ops.rst: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! atomic_set(&transport->queue_event, 1); // The data queue was empty, wakeup the 'reader' which is waiting for us. // Use 'smp_wmb' to make sure 'tail' that we stored will be seen by the user. // It is also fine if we did 'smp_mb' before, we will pair with 'smb_rmb' just fine. // Also using 'smp_wmb' to ensure 'atomic_set' did set the 'queue_event'. smp_wmb(); wake_up_interruptible(&transport->msg_wait_queue); TRANSPORT_PRINTF("woken up ht=%u nt=%u", tail, new_tail); } return true; } static long transport_queue_events_available(transport_t *transport) { uint32_t tail, head; int ev; shared_data_queue_t *queue = READ_ONCE(transport->queue); smp_rmb(); ev = atomic_xchg(&transport->queue_event, 0); if (ev) { TRANSPORT_PRINTF("check ev active"); return 1; } // This should not be necessary but doing it just in case. tail = READ_ONCE(queue->tail); head = READ_ONCE(queue->head); TRANSPORT_PRINTF("check h=%u t=%u", head, tail); return transport->shutdown || (head != tail); } // This function is called whenever userspace 'reader' deemed that there are no more events to read. // It will be waiting for the data queue to gain new content using 'msg_wait_queue'. // 'wake_up_interruptible' does 'wakeup' when it detects that the queue is being empty. static long transport_data_queue_wait(transport_t *transport) { shared_data_queue_t *queue = READ_ONCE(transport->queue); long ret; task_info_map_delete_exited(); if (!queue) { return -EINVAL; } if (wait_event_interruptible_exclusive(transport->msg_wait_queue, transport_queue_events_available(transport))) { ret = -EINTR; } else { if (transport->shutdown) { ret = -EIO; } else { ret = 0; } } return ret; } static int transport_data_queue_mmap(transport_t *transport, struct vm_area_struct *vma) { loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; unsigned long sz = vma->vm_end - vma->vm_start; unsigned long pfn; struct page *page; void *ptr; if (0 != offset) { return -EINVAL; } ptr = READ_ONCE(transport->queue); if (!ptr) { return -EINVAL; } page = virt_to_head_page(ptr); if (sz > (PAGE_SIZE << compound_order(page))) { return -EINVAL; } pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } static long data_queue_create(const data_queue_params_t *params, shared_data_queue_t **pqueue) { shared_data_queue_t *queue; uint32_t size = params->size; if (size <= DATA_QUEUE_HEADER_SIZE) return -EINVAL; queue = (shared_data_queue_t*) virt_mem_alloc(size); if (!queue) return -ENOMEM; *pqueue = queue; return 0; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - static bool transport_send_msg_nowait(transport_t *transport, msg_t *msg) { bool need_wakeup = false; msg_type_t mt = MSG_TYPE(msg); if (mt >= MT_FIRST_ACTIVITY_EVENT && !(atomic64_read(&transport->events_mask) & MSG_TYPE_TO_EVENT_MASK(mt))) { return false; } spin_lock(&transport->msg_spinlock); { if (transport->shutdown) { spin_unlock(&transport->msg_spinlock); return false; } if (msg_img_is_reply_required(MSG_IMG(msg))) { unsigned item_index; if (set_is_full(&transport->sent_msgs_set)) { WPRINTF("'sent_msgs_set' overflow 
(capacity=%u)", set_capacity(&transport->sent_msgs_set)); spin_unlock(&transport->msg_spinlock); transport_shutdown(transport); return false; } item_index = set_items_count(&transport->sent_msgs_set); /* 'msg ref/unref' for messages stored in 'sent_msgs_set' are invoked in 'msg_reply_wait_count inc/dec'. There is no need for separate 'msg ref/unref' calls. */ *(msg_t **) set_item_ptr(&transport->sent_msgs_set, item_index) = msg_reply_wait_count_inc(msg); set_items_count_set(&transport->sent_msgs_set, item_index + 1); } if (transport->queue) { need_wakeup = false; if (!transport_shared_data_queue_enqueue_impl(transport, MSG_IMG(msg), MSG_SIZE(msg))) { WPRINTF("mmaped queue overflow"); spin_unlock(&transport->msg_spinlock); transport_shutdown(transport); return false; } } else { need_wakeup = true; if (ring_is_full(&transport->msg_ring)) { WPRINTF("message queue overflow (capacity=%u)", ring_capacity(&transport->msg_ring)); spin_unlock(&transport->msg_spinlock); transport_shutdown(transport); return false; } *(msg_t **) ring_producer_ptr(&transport->msg_ring) = msg_ref(msg); ring_producer_index_move_one(&transport->msg_ring); } } spin_unlock(&transport->msg_spinlock); if (need_wakeup) { // wakeup userspace reader wake_up_interruptible_sync(&transport->msg_wait_queue); } return true; } static bool transport_send_hello_nowait(transport_t *transport) { msg_t *msg = hello_msg_new(); bool success; if (!msg) { success = false; } else { success = transport_send_msg_nowait(transport, msg); msg_unref(msg); } return success; } static bool send_msg_nowait(msg_t *msg) { bool sent = false; spin_lock(&transport_global.transport_spinlock); if (!transport_is_control_tgid_impl(current->tgid)) { transport_t *transport; transport_t *next_transport; list_for_each_entry_safe(transport, next_transport, &transport_global.transport_list, transport_list_node) { sent |= transport_send_msg_nowait(transport, msg); } } spin_unlock(&transport_global.transport_spinlock); return sent; } static transport_t *transport_new(void) { transport_t *transport = mem_alloc0(sizeof(transport_t)); if (transport) { INIT_LIST_HEAD(&transport->transport_list_node); // remember client's process doing 'open' to auto-ignore it transport->control_tgid = current->tgid; spin_lock_init(&transport->msg_spinlock); init_waitqueue_head(&transport->msg_wait_queue); atomic64_set(&transport->events_mask, 0); transport->shutdown = false; transport->queue = NULL; atomic_set(&transport->queue_event, 0); if (transport_ring_init(&transport->msg_ring) && transport_set_init(&transport->sent_msgs_set) && 0 == task_info_status_set(current->tgid, TS_IGNORE) && transport_send_hello_nowait(transport)) { transport_global_register(transport); } else { transport_free(transport); transport = NULL; } } DPRINTF("transport=%p", transport); return transport; } int __init transport_mod_init(void) { int ret; transport_global_init(); ret = device_mod_init(); if (ret) { EPRINTF("'device_mod_init()' failure %i", ret); } return ret; } void transport_mod_down(void) { DPRINTF(""); device_mod_down(); DPRINTF(""); } static msg_t *transport_lookup_msg_ref(transport_t *transport, msg_id_t reply_id) { msg_t* msg = NULL; DPRINTF(""); // TODO DK: Is it possible to use radix tree here instead? 
spin_lock(&transport->msg_spinlock); { void *item_ptr = set_begin_ptr(&transport->sent_msgs_set); void *end_ptr = set_end_ptr(&transport->sent_msgs_set); while (item_ptr < end_ptr) { msg_t *query = *(msg_t **) item_ptr; if (MSG_ID(query) == reply_id) { msg = query; msg_ref(msg); goto unlock; } item_ptr = set_ptr_next(&transport->sent_msgs_set, item_ptr); } } unlock: spin_unlock(&transport->msg_spinlock); DPRINTF("ret=%p", msg); return msg; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - static long transport_ioctl_pid_info(msg_varsized_t *reply_msg, msg_sized_t *query_msg) { long ret; if (query_msg->img_size < (sizeof(msg_img_t) + sizeof(get_pid_info_img_t))) { EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg))); ret = -EINVAL; } else { msg_img_t *msg_img = MSG_IMG(query_msg); get_pid_info_img_t *img = IMG_PAYLOAD(msg_img); pid_t pid = img->pid; ret = pid_info_return_msg_new(reply_msg, pid); } DPRINTF("ret=%li", ret); return ret; } static long transport_ioctl_fs_root(msg_varsized_t *reply_msg, msg_sized_t *query_msg) { long ret; if (query_msg->img_size < (sizeof(msg_img_t) + sizeof(get_fs_root_img_t))) { EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg))); ret = -EINVAL; } else { msg_img_t *msg_img = MSG_IMG(query_msg); get_fs_root_img_t *img = IMG_PAYLOAD(msg_img); pid_t pid = img->pid; ret = fs_root_return_msg_new(reply_msg, pid); } DPRINTF("ret=%li", ret); return ret; } static long install_file(struct file *file, int fd, int *pfd) { if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } // file is consumed so no need to call 'fput' fd_install(fd, file); *pfd = fd; return 0; } static long open_file_with_flags(const char *full_path, int uflags, int mode, int *pfd) { struct file *file; int flags; int fd; DPRINTF("Opening file '%s', uflags '%d', mode '%d'", full_path, uflags, mode); fd = get_unused_fd_compat(); if (fd < 0) { return fd; } flags = uflags #ifdef O_LARGEFILE | O_LARGEFILE #endif #ifdef O_NOATIME | O_NOATIME #endif // 'FMODE_NONOTIFY' refers to 'fanotify' not scanning the file. #ifdef FMODE_NONOTIFY | FMODE_NONOTIFY #endif ; file = filp_open(full_path, flags, mode); return install_file(file, fd, pfd); } static inline long open_file(struct path *path, int *pfd) { struct file *file; int flags; int fd; if (!path->dentry && !path->mnt) { return -ENOENT; } fd = get_unused_fd_compat(); if (fd < 0) { return fd; } flags = O_RDONLY #ifdef O_LARGEFILE | O_LARGEFILE #endif #ifdef O_NOATIME | O_NOATIME #endif // 'FMODE_NONOTIFY' refers to 'fanotify' not scanning the file. #ifdef FMODE_NONOTIFY | FMODE_NONOTIFY #endif ; file = dentry_open_compat(path, flags); if (IS_ERR(file)) { // If open failed, let's try to open via the path. // Notice that this open will be inside 'client' service context // so this 'filp_open' has a good chance of failing as // 'mount namespaces' might be different in the process. // Perhaps a proper solution would be opening the file inside the original // process context, but that would result in having to create 'file' // early or to do extra context switches to the scanned process. // Either way seems inefficient so it is currently avoided. 
size_t size = PAGE_SIZE; char *buf = mem_alloc(size); const char *full_path; if (!buf) { return -ENOMEM; } full_path = d_path(path, buf, size); if (!IS_ERR(full_path)) { file = filp_open(full_path, flags, 0); } mem_free(buf); } return install_file(file, fd, pfd); } static long transport_ioctl_handle_open_file_from_msg(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg) { long ret; msg_t* msg; msg_img_t *msg_img = MSG_IMG(query_msg); open_file_from_msg_img_t *img = IMG_PAYLOAD(msg_img); if (MSG_SIZE(query_msg) < sizeof(msg_img_t) + sizeof(open_file_from_msg_img_t)) return -EINVAL; msg = transport_lookup_msg_ref(transport, MSG_ID(query_msg)); if (!msg) { ret = -EINVAL; } else { int fd = -1; struct path path; thread_safe_path_load(img->num == 0 ? &msg->path : &msg->path2, &path); msg_unref(msg); ret = open_file(&path, &fd); path_put(&path); if (0 == ret) { ret = open_file_return_msg_new(reply_msg, fd); } } return ret; } static long transport_ioctl_handle_open_file_by_path(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg) { long ret; char *path; size_t pathSize; int fd = -1; msg_img_t *msg_img = MSG_IMG(query_msg); open_file_by_path_img_t *img = IMG_PAYLOAD(msg_img); if (MSG_SIZE(query_msg) <= sizeof(msg_img_t) + sizeof(open_file_by_path_img_t)) return -EINVAL; path = img->path; pathSize = MSG_SIZE(query_msg) - (sizeof(msg_img_t) + sizeof(open_file_by_path_img_t)); path[pathSize - 1] = '\0'; ret = open_file_with_flags(path, img->flags, img->mode, &fd); if (0 == ret) { ret = open_file_return_msg_new(reply_msg, fd); } return ret; } static long transport_ioctl_handle_get_version(msg_varsized_t *reply_msg) { return version_info_return_msg_new(reply_msg); } static long transport_ioctl_handle_data_queue_init(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg) { msg_img_t *msg_img = MSG_IMG(query_msg); data_queue_params_t *params = IMG_PAYLOAD(msg_img); long err; shared_data_queue_t *queue; uint32_t queue_size; if (MSG_SIZE(query_msg) < sizeof(msg_img_t) + sizeof(data_queue_params_t)) return -EINVAL; err = data_queue_create(params, &queue); if (err) { return err; } queue_size = params->size - DATA_QUEUE_HEADER_SIZE; { spin_lock(&transport->msg_spinlock); if (transport->queue) { spin_unlock(&transport->msg_spinlock); virt_mem_free(queue, params->size); return -EEXIST; } transport->queue = queue; transport->queue_size = queue_size; spin_unlock(&transport->msg_spinlock); } return data_queue_offsets_return_msg_new(reply_msg, queue_size); } static long transport_ioctl_write_read_msg(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg) { long ret; action_type_t action_type; if (MSG_REPLY(query_msg)) { EPRINTF("'reply' ioctl is not supported"); ret = -EINVAL; goto out; } action_type = MSG_TYPE(query_msg); switch (action_type) { case AT_GET_PID_INFO: ret = transport_ioctl_pid_info(reply_msg, query_msg); break; case AT_GET_FS_ROOT: ret = transport_ioctl_fs_root(reply_msg, query_msg); break; case AT_OPEN_FILE_FROM_MSG: ret = transport_ioctl_handle_open_file_from_msg(transport, reply_msg, query_msg); break; case AT_OPEN_FILE_BY_PATH: ret = transport_ioctl_handle_open_file_by_path(transport, reply_msg, query_msg); break; case AT_GET_VERSION: ret = transport_ioctl_handle_get_version(reply_msg); break; case AT_INIT_SHARED_DATA_QUEUE: ret = transport_ioctl_handle_data_queue_init(transport, reply_msg, query_msg); break; default: EPRINTF("Unexpected '%s' message", action_type_to_string(action_type)); HEX_DUMP("query_msg: ", 
MSG_IMG(query_msg), MSG_SIZE(query_msg)); ret = -EINVAL; break; } out: DPRINTF("ret=%li", ret); return ret; } static long transport_ioctl_copy_from_user(ioctl_hdr_t *ioctl_hdr, msg_varsized_t *query_msg, void __user *user_data) { long ret; size_t msg_size; msg_sized_t *msg; msg_img_t *msg_img; void *payload; if (copy_from_user(ioctl_hdr, user_data, sizeof(ioctl_hdr_t))) { EPRINTF("'copy_from_user()' failure"); ret = -EFAULT; goto out; } msg_size = ioctl_hdr->size; if (msg_size < sizeof(msg_img_t)) { EPRINTF("message image is too small"); ret = -EINVAL; goto out; } if (msg_size > TRANSPORT_MSG_SIZE_MAX) { EPRINTF("size > TRANSPORT_MSG_SIZE_MAX"); ret = -E2BIG; goto out; } msg = msg_varsized_init(query_msg, msg_size); if (!msg) { ret = -ENOMEM; goto out; } msg_img = MSG_IMG(msg); payload = (uint8_t *)user_data + sizeof(ioctl_hdr_t); if (copy_from_user(msg_img, payload, msg_size)) { msg_varsized_uninit(query_msg); EPRINTF("'copy_from_user()' failure"); ret = -EFAULT; goto out; } ret = 0; out: DPRINTF("ret=%li", ret); return ret; } static long transport_ioctl_copy_to_user(ioctl_hdr_t *ioctl_hdr, msg_sized_t *reply_msg, void __user *user_data) { long ret; size_t msg_size = MSG_SIZE(reply_msg); size_t capacity; void *payload; msg_img_t *msg_img; ioctl_hdr->size = msg_size; if (copy_to_user(user_data, ioctl_hdr, sizeof(ioctl_hdr_t))) { EPRINTF("'copy_to_user()' failure"); ret = -EFAULT; goto out; } capacity = ioctl_hdr->capacity; if (capacity < msg_size) { WPRINTF("capacity=%zu < msg_size=%zu", capacity, msg_size); ret = -ENOSPC; goto out; } payload = (uint8_t *)user_data + sizeof(ioctl_hdr_t); msg_img = MSG_IMG(reply_msg); if (copy_to_user(payload, msg_img, msg_size)) { EPRINTF("'copy_to_user()' failure"); ret = -EFAULT; goto out; } ret = 0; out: DPRINTF("ret=%li", ret); return ret; } long transport_device_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { transport_t *transport = filp->private_data; long ret; if (transport->shutdown) { ret = -EIO; goto out; } switch (cmd) { case IOCTL_WRITE_AND_READ_MSG: case IOCTL_READ_VERSION: { ioctl_hdr_t ioctl_hdr; void *user_data = (void *)arg; msg_varsized_t query_msg; ret = transport_ioctl_copy_from_user(&ioctl_hdr, &query_msg, user_data); if (!ret) { msg_varsized_t reply_msg; ret = transport_ioctl_write_read_msg(transport, &reply_msg, MSG_VARSIZED_GET_SIZED(&query_msg)); if (!ret) { ret = transport_ioctl_copy_to_user(&ioctl_hdr, MSG_VARSIZED_GET_SIZED(&reply_msg), user_data); msg_varsized_uninit(&reply_msg); } msg_varsized_uninit(&query_msg); } break; } default: EPRINTF("Unexpected IOCTL cmd=%u", cmd); ret = -ENOIOCTLCMD; } out: if (-EINVAL == ret) { transport_shutdown(transport); } DPRINTF("ret=%li", ret); return ret; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ssize_t transport_device_read(struct file *filp, char __user *user_data, size_t size, loff_t *offset) { msg_t *msg; transport_t *transport = filp->private_data; size_t img_size; ssize_t ret; if (filp->f_flags & O_NONBLOCK) { EPRINTF("'non-blocking' mode is not supported yet"); ret = -EINVAL; transport_shutdown(transport); goto out; } if (!size) { EPRINTF("'empty read' is not supported"); ret = -EINVAL; transport_shutdown(transport); goto out; } task_info_map_delete_exited(); retry_wait: // We may start with 'wait*()' because it itself starts // with 'condition' check. 
if (wait_event_interruptible_exclusive(transport->msg_wait_queue, transport->shutdown || !ring_is_empty(&transport->msg_ring))) { ret = -EINTR; goto out; } // Lock the state and check if processing is actually possible. spin_lock(&transport->msg_spinlock); { if (transport->shutdown) { ret = -EIO; spin_unlock(&transport->msg_spinlock); goto out; } if (ring_is_empty(&transport->msg_ring)) { WPRINTF("wakeup without messages"); spin_unlock(&transport->msg_spinlock); goto retry_wait; } msg = *(msg_t **) ring_consumer_ptr(&transport->msg_ring); img_size = msg->img_size; DPRINTF("size=%zu img_size=%zu", size, img_size); if (size < img_size) { ret = -ENOSPC; spin_unlock(&transport->msg_spinlock); goto out; } ring_consumer_index_move_one(&transport->msg_ring); } spin_unlock(&transport->msg_spinlock); // 'copy_to_user' MAY sleep (for example in page fault handler) if (copy_to_user(user_data, &msg->img, img_size)) { WPRINTF("'copy_to_user()' failure"); ret = -EFAULT; transport_shutdown(transport); } else { ret = img_size; } msg_unref(msg); out: DPRINTF("ret=%zi", ret); return ret; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Forward declaration, to be available in the end of the code static long wait_msg_killable_timeout(msg_t* msg, unsigned long timeout_jiffies); static void msg_wait_reply(msg_t *msg) { long ret; // Do not block 'control' process if (atomic_read(&msg->reply_wait_count) && MSG_TYPE(msg) != MT_PONG && transport_is_control_tgid(current->tgid)) { WPRINTF_RATELIMITED("avoiding blocking wait in control process"); return; } // We may start with 'wait*()' because it itself starts // with 'condition' check. DPRINTF("waiting for userspace reply..."); ret = wait_msg_killable_timeout(msg, msecs_to_jiffies(TRANSPORT_WAIT_REPLY_TIMEOUT_MSECS)); if (!ret) { // Timeout here means unexpected issue with userspace. FPRINTF("timeout waiting for userspace reply (msg_type=%i/%s)", MSG_TYPE(msg), msg_type_to_string(MSG_TYPE(msg))); HEX_DUMP("msg: ", MSG_IMG(msg), MSG_SIZE(msg)); dump_stack(); // identify and shutdown transport failed to reply transport_global_shutdown_msg(msg); } else if (ret < 0) { // Calling process has been interrupted as SIGKILL was received. // In practice this means 'block'. DPRINTF("message was interrupted..."); msg->interrupted = true; } else { // Userspace reply has been received (msg->reply_msg) or // waiting has been explicitly aborted (msg->aborted) for // example on userspace disconnect. 
DPRINTF("wait finished (msg->block=%i)", msg->block); } } static void msg_mark_async(msg_t *msg) { MSG_ID(msg) = 0; MSG_REPLY(msg) = false; } void send_msg_async(msg_t *msg) { DPRINTF("msg=%p", msg); msg_mark_async(msg); send_msg_nowait(msg); DPRINTF(""); } void send_msg_async_unref(msg_t *msg) { if (msg) { send_msg_async(msg); msg_unref(msg); } } static void msg_mark_sync(msg_t *msg) { MSG_ID(msg) = transport_global_sequence_next(); MSG_REPLY(msg) = false; } bool send_msg_sync_nowait(msg_t *msg) { bool sent; DPRINTF("msg=%p", msg); msg_mark_sync(msg); sent = send_msg_nowait(msg); DPRINTF("msg=%p sent=%i", msg, sent); return sent; } void send_msg_sync(msg_t *msg) { DPRINTF("msg=%p", msg); if (send_msg_sync_nowait(msg)) { msg_wait_reply(msg); } DPRINTF(""); } void send_msg_sync_unref(msg_t *msg) { if (msg) { send_msg_sync(msg); thread_safe_path_clear(&msg->path); msg_unref(msg); } } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - static int transport_handle_ping_msg(transport_t *transport, msg_sized_t *ping) { int ret; msg_t *pong; bool sync; if (ping->img_size < (sizeof(msg_img_t) + sizeof(ping_img_t))) { DPRINTF("'ping' message is too short. ignoring it."); ret = -EINVAL; goto out; } pong = pong_msg_new(ping); if (!pong) { ret = -ENOMEM; goto out; } // reflect ping's 'reply' policy sync = !!MSG_ID(ping); if (sync) { msg_mark_sync(pong); } else { msg_mark_async(pong); } transport_send_msg_nowait(transport, pong); if (sync) { msg_wait_reply(pong); } msg_unref(pong); ret = 0; out: return ret; } static int transport_handle_set_pid_status_msg(msg_sized_t *msg) { msg_img_t *msg_img; pid_set_st_img_t *img; pid_t pid; task_status_t status; int ret; if (msg->img_size < (sizeof(msg_img_t) + sizeof(pid_set_st_img_t))) { DPRINTF("'pid' message is too short. ignoring it."); ret = -EINVAL; goto out; } msg_img = MSG_IMG(msg); img = IMG_PAYLOAD(msg_img); pid = img->pid; status = img->status; ret = task_info_status_set(pid, status); out: DPRINTF("ret=%i", ret); return ret; } static int transport_handle_del_pid_msg(msg_sized_t *msg) { msg_img_t *msg_img; pid_del_img_t *img; pid_t pid; int ret; if (msg->img_size < (sizeof(msg_img_t) + sizeof(pid_del_img_t))) { DPRINTF("'pid' message is too short. ignoring it."); ret = -EINVAL; goto out; } msg_img = MSG_IMG(msg); img = IMG_PAYLOAD(msg_img); pid = img->pid; ret = task_info_map_del(pid); out: DPRINTF("ret=%i", ret); return ret; } static int transport_handle_enable_events(transport_t *transport, msg_sized_t *msg) { msg_img_t *msg_img; events_mask_img_t *img; uint64_t mask; uint64_t old_mask; int ret; if (msg->img_size < (sizeof(msg_img_t) + sizeof(events_mask_img_t))) { DPRINTF("'events' message is too short. ignoring it."); ret = -EINVAL; goto out; } msg_img = MSG_IMG(msg); img = IMG_PAYLOAD(msg_img); mask = img->events_mask; spin_lock(&transport->msg_spinlock); { if (transport->shutdown) { // Do not allow changing the mask when shutdown. // Transport will not be able to receive any events. 
ret = -EFAULT; } else { old_mask = atomic64_read(&transport->events_mask); atomic64_set(&transport->events_mask, old_mask | mask); ret = 0; } } spin_unlock(&transport->msg_spinlock); transport_global_recalculate_combined_event_mask(); out: DPRINTF("ret=%i", ret); return ret; } static int transport_handle_disable_events(transport_t *transport, msg_sized_t *msg) { msg_img_t *msg_img; events_mask_img_t *img; uint64_t mask; uint64_t old_mask; int ret; if (msg->img_size < (sizeof(msg_img_t) + sizeof(events_mask_img_t))) { DPRINTF("'events' message is too short. ignoring it."); ret = -EINVAL; goto out; } msg_img = MSG_IMG(msg); img = IMG_PAYLOAD(msg_img); mask = img->events_mask; spin_lock(&transport->msg_spinlock); { if (transport->shutdown) { // Do not allow changing the mask when shutdown. // It is not particularly useful but keeping it. ret = -EFAULT; } else { old_mask = atomic64_read(&transport->events_mask); atomic64_set(&transport->events_mask, old_mask & (~mask)); ret = 0; } } spin_unlock(&transport->msg_spinlock); transport_global_recalculate_combined_event_mask(); out: DPRINTF("ret=%i", ret); return ret; } // FIXME: do something with 'reply'. For example merge several replies // into one; link replies into list; extract 'responces' and merge them. static void handle_reply(msg_t *query_msg, msg_sized_t *reply_msg) { // handle 'long' 'reply' size_t headers_size = sizeof(msg_img_t) + sizeof(reply_img_t); // Note: for compatibility with legacy short 'reply_img_t' default 'reply_type' is RT_ALLOW if (MSG_SIZE(reply_msg) >= headers_size) { msg_img_t *reply_msg_img = MSG_IMG(reply_msg); reply_img_t *reply_img = IMG_PAYLOAD(reply_msg_img); reply_type_t reply_type = reply_img->type; DPRINTF("MSG_SIZE(reply_msg)=%zu - headers_size=%zu = %zu reply_type=%u", MSG_SIZE(reply_msg), headers_size, MSG_SIZE(reply_msg) - headers_size, reply_type); if (RT_BLOCK == reply_type) { query_msg->block = true; } } } static int transport_handle_reply(transport_t *transport, msg_sized_t *reply) { int ret; msg_id_t reply_id = MSG_ID(reply); msg_type_t reply_type = MSG_TYPE(reply); DPRINTF(""); // find 'query' matching this 'reply' spin_lock(&transport->msg_spinlock); { void *item_ptr = set_begin_ptr(&transport->sent_msgs_set); void *end_ptr = set_end_ptr(&transport->sent_msgs_set); while (item_ptr < end_ptr) { msg_t *query = *(msg_t **) item_ptr; if (MSG_ID(query) == reply_id) { // remove 'query' from 'set' *(msg_t **) item_ptr = *(msg_t **) set_item_ptr( &transport->sent_msgs_set, set_items_count_dec(&transport->sent_msgs_set)); handle_reply(query, reply); msg_reply_wait_count_dec(query); ret = 0; goto unlock; } item_ptr = set_ptr_next(&transport->sent_msgs_set, item_ptr); } WPRINTF("Unexpected 'reply' with type=%i id=%llX", reply_type, reply_id); ret = -EINVAL; } unlock: spin_unlock(&transport->msg_spinlock); DPRINTF("ret=%i", ret); return ret; } static int transport_handle_msg(transport_t *transport, msg_sized_t *msg) { int ret; if (msg->img_size < sizeof(msg_img_t)) { DPRINTF("message image is too small"); ret = -EINVAL; goto out; } if (MSG_REPLY(msg)) { ret = transport_handle_reply(transport, msg); } else { // !reply action_type_t type = MSG_TYPE(msg); switch (type) { case AT_PING: ret = transport_handle_ping_msg(transport, msg); break; case AT_PID_SET_ST: ret = transport_handle_set_pid_status_msg(msg); break; case AT_PID_DEL: ret = transport_handle_del_pid_msg(msg); break; case AT_ENABLE_EVENTS: ret = transport_handle_enable_events(transport, msg); break; case AT_DISABLE_EVENTS: ret = 
transport_handle_disable_events(transport, msg); break; case AT_WAIT_SHARED_DATA_QUEUE: ret = transport_data_queue_wait(transport); break; case AT_EX_EVENTS_SUPPORTED: ret = 0; break; default: WPRINTF("Unexpected message type=%i/%s", type, action_type_to_string(type)); ret = -EINVAL; } } out: DPRINTF("ret=%i", ret); return ret; } ssize_t transport_device_write(struct file *filp, const char __user *user_data, size_t size, loff_t *offset) { transport_t *transport = filp->private_data; msg_varsized_t msg; msg_sized_t* smsg; msg_img_t *msg_img; ssize_t ret; if (transport->shutdown) { ret = -EIO; goto out; } if (filp->f_flags & O_NONBLOCK) { EPRINTF("'non-blocking' mode is not supported yet"); ret = -EINVAL; transport_shutdown(transport); goto out; } if (!size) { WPRINTF("'zero write' is not supported"); ret = -EINVAL; transport_shutdown(transport); goto out; } if (size > TRANSPORT_MSG_SIZE_MAX) { WPRINTF("size > TRANSPORT_MSG_SIZE_MAX"); ret = -E2BIG; goto out; } smsg = msg_varsized_init(&msg, size); if (!smsg) { ret = -ENOMEM; goto out; } msg_img = MSG_IMG(smsg); if (copy_from_user(msg_img, user_data, size)) { EPRINTF("'copy_from_user()' failure"); ret = -EFAULT; transport_shutdown(transport); goto free_msg; } ret = transport_handle_msg(transport, smsg); if (ret) { // make sure error code is negative if (ret > 0) { EPRINTF("error code must be negative"); ret = -ret; } goto free_msg; } ret = size; free_msg: msg_varsized_uninit(&msg); out: DPRINTF("ret=%zi", ret); return ret; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - /* Warning: 'transport_open()' and 'transport_release()' may be simultaneously invoked by several threads or processes. Note: We can match different 'transport' instances using 'device' 'major'/'minor' from 'inode->i_rdev'. Pointer to selected 'trasport' can be stored in 'filp->private_data' for later use in '*_read()', '*_write()', etc. Note: We may create 'transport' on 'first open' and destroy it on 'last close'. */ /* There is possibility of 'deadlock' between our 'kernel' and 'userspace' code while processing events generated by our userspace process until registration of our userspace process in 'ignore' list. 
*/ int transport_device_open(struct inode *inode, struct file *filp) { transport_t *transport; int ret; DPRINTF("inode->i_rdev: major=%u minor=%u", imajor(inode), iminor(inode)); DPRINTF("filp->f_flags=%X", filp->f_flags); if (filp->f_flags & O_NONBLOCK) { EPRINTF("'non-blocking' mode is not supported yet"); ret = -EINVAL; goto out; } mutex_lock(&transport_global.transport_mutex); { DPRINTF("transport_count=%u", transport_global.transport_count); transport = transport_new(); if (!transport) { WPRINTF("'%s()' failure", "transport_new"); ret = -ENOMEM; goto unlock_open_close_mutex; } filp->private_data = transport; if (!transport_global.transport_count) { // FIXME: 'attach' may fail IPRINTF("attaching interceptors"); mod_rundown_protection_set_ready(); ret = syscall_hooks_attach(); if (ret) { EPRINTF("'%s()' failure %i", "syscall_hooks_attach", ret); goto unlock_open_close_mutex; } ret = tracepoints_attach(); if (ret) { EPRINTF("'%s()' failure %i", "tracepoints_attach", ret); syscall_hooks_detach(); goto unlock_open_close_mutex; } IPRINTF("interceptors attached"); } ++transport_global.transport_count; ret = 0; } unlock_open_close_mutex: mutex_unlock(&transport_global.transport_mutex); out: DPRINTF("ret=%i", ret); return ret; } // 'release()' means 'close()' int transport_device_release(struct inode *inode, struct file *filp) { bool ok; mutex_lock(&transport_global.transport_mutex); { transport_t *transport = filp->private_data; transport_shutdown(transport); transport_free(transport); DPRINTF("transport_count=%u", transport_global.transport_count); if (!--transport_global.transport_count) { IPRINTF("detaching interceptors"); tracepoints_detach(); // FIXME: 'syscall_hooks_detach()' may fail syscall_hooks_detach(); mod_rundown_protection_set_rundown_active(); ok = mod_rundown_protection_wait_for_rundown_timeout(msecs_to_jiffies(TRANSPORT_WAIT_RUNDOWN_TIMEOUT_MSECS)); if (!ok) { WPRINTF("Failed to wait for module rundown"); } task_info_map_clear(); IPRINTF("interceptors detached"); } } mutex_unlock(&transport_global.transport_mutex); return 0; } int transport_device_mmap(struct file *filp, struct vm_area_struct *vma) { int ret; transport_t *transport = filp->private_data; if (transport->shutdown) { ret = -EIO; goto out; } ret = transport_data_queue_mmap(transport, vma); out: return ret; } static long wait_msg_killable_timeout(msg_t* msg, unsigned long timeout_jiffies) { #ifndef HAVE_WAIT_EVENT_KILLABLE_TIMEOUT // 'wait_event_interruptible_timeout' has to be a define and so is // 'TASK_KILLABLE' and 'TASK_INTERRUPTIBLE'. // I need functionality of 'wait_event_interruptible_timeout' // but 'TASK_INTERRUPTIBLE' replaced with 'TASK_KILLABLE' which // is achieved using the 'define' tricks by redefining 'TASK_INTERRUPTIBLE'. // If the trick won't work, using the regular 'wait_event_timeout'. #if defined(TASK_KILLABLE) && defined(TASK_INTERRUPTIBLE) && defined(wait_event_interruptible_timeout) && !defined(signal_pending) #undef TASK_INTERRUPTIBLE #define TASK_INTERRUPTIBLE TASK_KILLABLE #define signal_pending fatal_signal_pending return wait_event_interruptible_timeout(msg->wait_queue, !atomic_read(&msg->reply_wait_count), timeout_jiffies); #undef TASK_INTERRUPTIBLE #undef signal_pending #else // Something weird is going on, rollback to 'TASK_UNINTERRUPTIBLE' variant. // It should not cause any issues though as far as APL // daemon is responding to events so it is not bad. 
	return wait_event_timeout(msg->wait_queue, !atomic_read(&msg->reply_wait_count), timeout_jiffies);
#endif
#else
	// Just use the well defined macros available.
	return wait_event_killable_timeout(msg->wait_queue, !atomic_read(&msg->reply_wait_count), timeout_jiffies);
#endif
}
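The enqueue path above publishes new entries with smp_store_release() on 'tail' and observes the reader's progress through smp_load_acquire() on 'head'; the userspace half of that contract is not part of this file. Below is a minimal sketch of what a matching userspace reader could look like. The layouts shared_data_queue_view_t / data_queue_entry_view_t, the helper data_queue_drain() and the use of GCC __atomic builtins are assumptions made here for illustration; the real shared_data_queue_t / data_queue_entry_t definitions come from the module's shared headers.

/*
 * Hypothetical userspace view of the mmap'ed queue.  Field order and the
 * entry header are assumptions for illustration only.
 */
#include <stdint.h>

typedef struct {
    uint32_t head;              /* consumer index into entries[]          */
    uint32_t tail;              /* producer index, written by the kernel  */
    uint8_t  entries[];         /* ring storage of 'queue_size' bytes     */
} shared_data_queue_view_t;

typedef struct {
    uint32_t size;              /* payload size, header excluded */
    uint8_t  data[];
} data_queue_entry_view_t;

#define ENTRY_HDR_SIZE  sizeof(data_queue_entry_view_t)
#define ROUND_UP_4(n)   (((n) + 3u) & ~3u)

/*
 * Drain everything the kernel writer has published so far.
 * The load-acquire on 'tail' pairs with the kernel's smp_store_release(),
 * and the store-release on 'head' pairs with the kernel's smp_load_acquire().
 */
static void data_queue_drain(shared_data_queue_view_t *q, uint32_t queue_size,
                             void (*consume)(const void *data, uint32_t size))
{
    uint32_t head = q->head;    /* only the reader ever writes 'head' */
    uint32_t tail = __atomic_load_n(&q->tail, __ATOMIC_ACQUIRE);

    while (head != tail) {
        data_queue_entry_view_t *entry =
            (data_queue_entry_view_t *)(q->entries + head);

        /*
         * A "zapped" entry (size too large to fit before the end of the
         * buffer), or no room left even for a header, means the writer
         * wrapped around: the next real entry starts at offset 0.
         */
        if (queue_size - head < ENTRY_HDR_SIZE ||
            ENTRY_HDR_SIZE + entry->size > queue_size - head) {
            head = 0;
            continue;
        }

        consume(entry->data, entry->size);
        /* The kernel rounds the published 'tail' up to 4 bytes; mirror it. */
        head = ROUND_UP_4(head + ENTRY_HDR_SIZE + entry->size);
    }

    /* Publish the freed space back to the kernel writer. */
    __atomic_store_n(&q->head, head, __ATOMIC_RELEASE);
}

Once the queue is drained, a real client would ask the driver to block until more entries are published (the AT_WAIT_SHARED_DATA_QUEUE message handled by transport_data_queue_wait() above) rather than spinning on an empty queue.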
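transport_ioctl_copy_from_user() and transport_ioctl_copy_to_user() above define the framing for IOCTL_WRITE_AND_READ_MSG: an ioctl_hdr_t carrying the query size and the reply capacity, immediately followed by the message image in the same user buffer, with the reply written back in place. The helper below is a hedged userspace sketch of that round trip; ioctl_hdr_view_t, MSG_SIZE_MAX and transport_roundtrip() are illustrative names, and the header field widths, the device node and the ioctl request code are assumptions to be taken from the module's shared headers.

/*
 * Hypothetical userspace helper for the IOCTL_WRITE_AND_READ_MSG framing:
 * a header (query size + reply capacity) immediately followed by the
 * message image.  Field widths and the request code are assumptions.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>

typedef struct {
    uint32_t size;              /* bytes of message image after the header        */
    uint32_t capacity;          /* bytes available after the header for the reply */
} ioctl_hdr_view_t;

#define MSG_SIZE_MAX 1024       /* mirrors TRANSPORT_MSG_SIZE_MAX */

/*
 * Send one query image and receive the reply image written back in place.
 * Returns the reply size, or -1 on failure.
 */
static ssize_t transport_roundtrip(int fd, unsigned long write_read_cmd,
                                   const void *query, uint32_t query_size,
                                   void *reply, uint32_t reply_capacity)
{
    struct {
        ioctl_hdr_view_t hdr;           /* header first ...              */
        uint8_t img[MSG_SIZE_MAX];      /* ... message image right after */
    } buf;

    if (query_size > MSG_SIZE_MAX)
        return -1;

    buf.hdr.size = query_size;          /* read by transport_ioctl_copy_from_user() */
    buf.hdr.capacity = MSG_SIZE_MAX;    /* checked by transport_ioctl_copy_to_user() */
    memcpy(buf.img, query, query_size);

    if (ioctl(fd, write_read_cmd, &buf) < 0)
        return -1;

    /* On success the driver rewrote hdr.size with the reply image size. */
    if (buf.hdr.size > reply_capacity)
        return -1;
    memcpy(reply, buf.img, buf.hdr.size);
    return (ssize_t)buf.hdr.size;
}

The file descriptor would come from opening the character device registered through device_mod_init() (implemented in device.c, not shown here), and the 1 KiB image bound mirrors TRANSPORT_MSG_SIZE_MAX.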