diff --git a/docopt.h b/docopt.h new file mode 100644 index 0000000000000000000000000000000000000000..dc72026cab6db4ff07cd50ddd1580af0da32f7e5 --- /dev/null +++ b/docopt.h @@ -0,0 +1,65 @@ +// +// docopt.h +// docopt +// +// Created by Jared Grubb on 2013-11-03. +// Copyright (c) 2013 Jared Grubb. All rights reserved. +// + +#ifndef docopt__docopt_h_ +#define docopt__docopt_h_ + +#include "docopt_value.h" + +#include <map> +#include <vector> +#include <string> + +namespace docopt { + + // Usage string could not be parsed (ie, the developer did something wrong) + struct DocoptLanguageError : std::runtime_error { using runtime_error::runtime_error; }; + + // Arguments passed by user were incorrect (ie, developer was good, user is wrong) + struct DocoptArgumentError : std::runtime_error { using runtime_error::runtime_error; }; + + // Arguments contained '--help' and parsing was aborted early + struct DocoptExitHelp : std::runtime_error { DocoptExitHelp(); }; + + // Arguments contained '--version' and parsing was aborted early + struct DocoptExitVersion : std::runtime_error { DocoptExitVersion(); }; + + /// Parse user options from the given option string. + /// + /// @param doc The usage string + /// @param argv The user-supplied arguments + /// @param help Whether to end early if '-h' or '--help' is in the argv + /// @param version Whether to end early if '--version' is in the argv + /// @param options_first Whether options must precede all args (true), or if args and options + /// can be arbitrarily mixed. + /// + /// @throws DocoptLanguageError if the doc usage string had errors itself + /// @throws DocoptExitHelp if 'help' is true and the user has passed the '--help' argument + /// @throws DocoptExitVersion if 'version' is true and the user has passed the '--version' argument + /// @throws DocoptArgumentError if the user's argv did not match the usage patterns + std::map<std::string, value> docopt_parse(std::string const& doc, + std::vector<std::string> const& argv, + bool help = true, + bool version = true, + bool options_first = false); + + /// Parse user options from the given string, and exit appropriately + /// + /// Calls 'docopt_parse' and will terminate the program if any of the exceptions above occur: + /// * DocoptLanguageError - print error and terminate (with exit code -1) + /// * DocoptExitHelp - print usage string and terminate (with exit code 0) + /// * DocoptExitVersion - print version and terminate (with exit code 0) + /// * DocoptArgumentError - print error and usage string and terminate (with exit code -1) + std::map<std::string, value> docopt(std::string const& doc, + std::vector<std::string> const& argv, + bool help = true, + std::string const& version = {}, + bool options_first = false) noexcept; +} + +#endif /* defined(docopt__docopt_h_) */ diff --git a/docopt_private.h b/docopt_private.h new file mode 100644 index 0000000000000000000000000000000000000000..b323a2ee3fe1fad145e261945fc409d70fe4fd36 --- /dev/null +++ b/docopt_private.h @@ -0,0 +1,309 @@ +// +// docopt_private.h +// docopt +// +// Created by Jared Grubb on 2013-11-04. +// Copyright (c) 2013 Jared Grubb. All rights reserved. +// + +#ifndef docopt_docopt_private_h +#define docopt_docopt_private_h + +#include <vector> +#include <memory> +#include <unordered_set> + +#include "docopt_value.h" + +namespace docopt { + + class Pattern; + class LeafPattern; + + using PatternList = std::vector<std::shared_ptr<Pattern>>; + + // Utility to use Pattern types in std hash-containers + struct PatternHasher { + template <typename P> + size_t operator()(std::shared_ptr<P> const& pattern) const { + return pattern->hash(); + } + template <typename P> + size_t operator()(P const* pattern) const { + return pattern->hash(); + } + template <typename P> + size_t operator()(P const& pattern) const { + return pattern.hash(); + } + }; + + // Utility to use 'hash' as the equality operator as well in std containers + struct PatternPointerEquality { + template <typename P1, typename P2> + bool operator()(std::shared_ptr<P1> const& p1, std::shared_ptr<P2> const& p2) const { + return p1->hash()==p2->hash(); + } + template <typename P1, typename P2> + bool operator()(P1 const* p1, P2 const* p2) const { + return p1->hash()==p2->hash(); + } + }; + + // A hash-set that uniques by hash value + using UniquePatternSet = std::unordered_set<std::shared_ptr<Pattern>, PatternHasher, PatternPointerEquality>; + + + class Pattern { + public: + // flatten out children, stopping descent when the given filter returns 'true' + virtual std::vector<Pattern*> flat(bool (*filter)(Pattern const*)) = 0; + + // flatten out all children into a list of LeafPattern objects + virtual void collect_leaves(std::vector<LeafPattern*>&) = 0; + + // flatten out all children into a list of LeafPattern objects + std::vector<LeafPattern*> leaves(); + + // Attempt to find something in 'left' that matches this pattern's spec, and if so, move it to 'collected' + virtual bool match(PatternList& left, std::vector<std::shared_ptr<LeafPattern>>& collected) const = 0; + + virtual std::string const& name() const = 0; + + virtual bool hasValue() const { return false; } + + virtual size_t hash() const = 0; + + virtual ~Pattern() = default; + }; + + class LeafPattern + : public Pattern { + public: + LeafPattern(std::string name, value v = {}) + : fName(std::move(name)), + fValue(std::move(v)) + {} + + virtual std::vector<Pattern*> flat(bool (*filter)(Pattern const*)) override { + if (filter(this)) { + return { this }; + } + return {}; + } + + virtual void collect_leaves(std::vector<LeafPattern*>& lst) override final { + lst.push_back(this); + } + + virtual bool match(PatternList& left, std::vector<std::shared_ptr<LeafPattern>>& collected) const override; + + virtual bool hasValue() const override { return static_cast<bool>(fValue); } + + value const& getValue() const { return fValue; } + void setValue(value&& v) { fValue = std::move(v); } + + virtual std::string const& name() const override { return fName; } + + virtual size_t hash() const override { + size_t seed = typeid(*this).hash_code(); + hash_combine(seed, fName); + hash_combine(seed, fValue); + return seed; + } + + protected: + virtual std::pair<size_t, std::shared_ptr<LeafPattern>> single_match(PatternList const&) const = 0; + + private: + std::string fName; + value fValue; + }; + + class BranchPattern + : public Pattern { + public: + BranchPattern(PatternList children = {}) + : fChildren(std::move(children)) + {} + + Pattern& fix() { + UniquePatternSet patterns; + fix_identities(patterns); + fix_repeating_arguments(); + return *this; + } + + virtual std::string const& name() const override { + throw std::runtime_error("Logic error: name() shouldnt be called on a BranchPattern"); + } + + virtual value const& getValue() const { + throw std::runtime_error("Logic error: name() shouldnt be called on a BranchPattern"); + } + + virtual std::vector<Pattern*> flat(bool (*filter)(Pattern const*)) override { + if (filter(this)) { + return {this}; + } + + std::vector<Pattern*> ret; + for(auto& child : fChildren) { + auto sublist = child->flat(filter); + ret.insert(ret.end(), sublist.begin(), sublist.end()); + } + return ret; + } + + virtual void collect_leaves(std::vector<LeafPattern*>& lst) override final { + for(auto& child : fChildren) { + child->collect_leaves(lst); + } + } + + void setChildren(PatternList children) { + fChildren = std::move(children); + } + + PatternList const& children() const { return fChildren; } + + virtual void fix_identities(UniquePatternSet& patterns) { + for(auto& child : fChildren) { + // this will fix up all its children, if needed + if (auto bp = dynamic_cast<BranchPattern*>(child.get())) { + bp->fix_identities(patterns); + } + + // then we try to add it to the list + auto inserted = patterns.insert(child); + if (!inserted.second) { + // already there? then reuse the existing shared_ptr for that thing + child = *inserted.first; + } + } + } + + virtual size_t hash() const override { + size_t seed = typeid(*this).hash_code(); + hash_combine(seed, fChildren.size()); + for(auto const& child : fChildren) { + hash_combine(seed, child->hash()); + } + return seed; + } + private: + void fix_repeating_arguments(); + + protected: + PatternList fChildren; + }; + + class Argument + : public LeafPattern { + public: + using LeafPattern::LeafPattern; + + protected: + virtual std::pair<size_t, std::shared_ptr<LeafPattern>> single_match(PatternList const& left) const override; + }; + + class Command : public Argument { + public: + Command(std::string name, value v = value{false}) + : Argument(std::move(name), std::move(v)) + {} + + protected: + virtual std::pair<size_t, std::shared_ptr<LeafPattern>> single_match(PatternList const& left) const override; + }; + + class Option final + : public LeafPattern + { + public: + static Option parse(std::string const& option_description); + + Option(std::string shortOption, + std::string longOption, + int argcount = 0, + value v = value{false}) + : LeafPattern(longOption.empty() ? shortOption : longOption, + std::move(v)), + fShortOption(std::move(shortOption)), + fLongOption(std::move(longOption)), + fArgcount(argcount) + { + // From Python: + // self.value = None if value is False and argcount else value + if (argcount && v.isBool() && !v.asBool()) { + setValue(value{}); + } + } + + Option(Option const&) = default; + Option(Option&&) = default; + Option& operator=(Option const&) = default; + Option& operator=(Option&&) = default; + + using LeafPattern::setValue; + + std::string const& longOption() const { return fLongOption; } + std::string const& shortOption() const { return fShortOption; } + int argCount() const { return fArgcount; } + + virtual size_t hash() const override { + size_t seed = LeafPattern::hash(); + hash_combine(seed, fShortOption); + hash_combine(seed, fLongOption); + hash_combine(seed, fArgcount); + return seed; + } + + protected: + virtual std::pair<size_t, std::shared_ptr<LeafPattern>> single_match(PatternList const& left) const override; + + private: + std::string fShortOption; + std::string fLongOption; + int fArgcount; + }; + + class Required : public BranchPattern { + public: + using BranchPattern::BranchPattern; + + bool match(PatternList& left, std::vector<std::shared_ptr<LeafPattern>>& collected) const override; + }; + + class Optional : public BranchPattern { + public: + using BranchPattern::BranchPattern; + + bool match(PatternList& left, std::vector<std::shared_ptr<LeafPattern>>& collected) const override { + for(auto const& pattern : fChildren) { + pattern->match(left, collected); + } + return true; + } + }; + + class OptionsShortcut : public Optional { + using Optional::Optional; + }; + + class OneOrMore : public BranchPattern { + public: + using BranchPattern::BranchPattern; + + bool match(PatternList& left, std::vector<std::shared_ptr<LeafPattern>>& collected) const override; + }; + + class Either : public BranchPattern { + public: + using BranchPattern::BranchPattern; + + bool match(PatternList& left, std::vector<std::shared_ptr<LeafPattern>>& collected) const override; + }; +} + +#endif diff --git a/docopt_util.h b/docopt_util.h new file mode 100644 index 0000000000000000000000000000000000000000..75288c8c5cbaaccc523068973f35341cde58bb6d --- /dev/null +++ b/docopt_util.h @@ -0,0 +1,97 @@ +// +// docopt_util.h +// docopt +// +// Created by Jared Grubb on 2013-11-04. +// Copyright (c) 2013 Jared Grubb. All rights reserved. +// + +#ifndef docopt_docopt_util_h +#define docopt_docopt_util_h + + +namespace { + bool starts_with(std::string const& str, std::string const& prefix) + { + if (str.length() < prefix.length()) + return false; + return std::equal(prefix.begin(), prefix.end(), + str.begin()); + } + + std::string trim(std::string&& str, + const std::string& whitespace = " \t\n") + { + const auto strEnd = str.find_last_not_of(whitespace); + if (strEnd==std::string::npos) + return {}; // no content + str.erase(strEnd+1); + + const auto strBegin = str.find_first_not_of(whitespace); + str.erase(0, strBegin); + + return std::move(str); + } + + std::vector<std::string> split(std::string const& str, size_t pos = 0) + { + const char* const anySpace = " \t\r\n\v\f"; + + std::vector<std::string> ret; + while (pos != std::string::npos) { + auto start = str.find_first_not_of(anySpace, pos); + if (start == std::string::npos) break; + + auto end = str.find_first_of(anySpace, start); + auto size = end==std::string::npos ? end : end-start; + ret.emplace_back(str.substr(start, size)); + + pos = end; + } + + return ret; + } + + std::tuple<std::string, std::string, std::string> partition(std::string str, std::string const& point) + { + std::tuple<std::string, std::string, std::string> ret; + + auto i = str.find(point); + + if (i == std::string::npos) { + // no match: string goes in 0th spot only + } else { + std::get<2>(ret) = str.substr(i + point.size()); + std::get<1>(ret) = point; + str.resize(i); + } + std::get<0>(ret) = std::move(str); + + return ret; + } + + template <typename I> + std::string join(I iter, I end, std::string const& delim) { + if (iter==end) + return {}; + + std::string ret = *iter; + for(++iter; iter!=end; ++iter) { + ret.append(delim); + ret.append(*iter); + } + return ret; + } +} + +namespace docopt { + template <class T> + inline void hash_combine(std::size_t& seed, T const& v) + { + // stolen from boost::hash_combine + std::hash<T> hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); + } +} + +#endif diff --git a/docopt_value.h b/docopt_value.h new file mode 100644 index 0000000000000000000000000000000000000000..8f32778e8c0c890666a4777d3b1aa739710a4737 --- /dev/null +++ b/docopt_value.h @@ -0,0 +1,321 @@ +// +// value.h +// docopt +// +// Created by Jared Grubb on 2013-10-14. +// Copyright (c) 2013 Jared Grubb. All rights reserved. +// + +#ifndef docopt__value_h_ +#define docopt__value_h_ + +#include <string> +#include <vector> +#include <functional> // std::hash +#include <iosfwd> + +namespace docopt { + + /// A generic type to hold the various types that can be produced by docopt. + /// + /// This type can be one of: {bool, long, string, vector<string>}, or empty. + struct value { + /// An empty value + value() {} + + value(std::string); + value(std::vector<std::string>); + + explicit value(bool); + explicit value(long); + explicit value(int v) : value(static_cast<long>(v)) {} + + ~value(); + value(value const&); + value(value&&) noexcept; + value& operator=(value const&); + value& operator=(value&&) noexcept; + + // Test if this object has any contents at all + explicit operator bool() const { return kind != Kind::Empty; } + + // Test the type contained by this value object + bool isBool() const { return kind==Kind::Bool; } + bool isString() const { return kind==Kind::String; } + bool isLong() const { return kind==Kind::Long; } + bool isStringList() const { return kind==Kind::StringList; } + + // Throws std::invalid_argument if the type does not match + bool asBool() const; + long asLong() const; + std::string const& asString() const; + std::vector<std::string> const& asStringList() const; + + size_t hash() const noexcept; + + // equality is based on hash-equality + friend bool operator==(value const&, value const&); + friend bool operator!=(value const&, value const&); + + private: + enum class Kind { + Empty, + Bool, + Long, + String, + StringList + }; + + union Variant { + Variant() {} + ~Variant() { /* do nothing; will be destroyed by ~value */ } + + bool boolValue; + long longValue; + std::string strValue; + std::vector<std::string> strList; + }; + + static const char* kindAsString(Kind); + void throwIfNotKind(Kind expected) const; + + private: + Kind kind = Kind::Empty; + Variant variant {}; + }; + + /// Write out the contents to the ostream + std::ostream& operator<<(std::ostream&, value const&); +} + +namespace std { + template <> + struct hash<docopt::value> { + size_t operator()(docopt::value const& val) const noexcept { + return val.hash(); + } + }; +} + +namespace docopt { + inline + value::value(bool v) + : kind(Kind::Bool) + { + variant.boolValue = v; + } + + inline + value::value(long v) + : kind(Kind::Long) + { + variant.longValue = v; + } + + inline + value::value(std::string v) + : kind(Kind::String) + { + new (&variant.strValue) std::string(std::move(v)); + } + + inline + value::value(std::vector<std::string> v) + : kind(Kind::StringList) + { + new (&variant.strList) std::vector<std::string>(std::move(v)); + } + + inline + value::value(value const& other) + : kind(other.kind) + { + switch (kind) { + case Kind::String: + new (&variant.strValue) std::string(other.variant.strValue); + break; + + case Kind::StringList: + new (&variant.strList) std::vector<std::string>(other.variant.strList); + break; + + case Kind::Bool: + variant.boolValue = other.variant.boolValue; + break; + + case Kind::Long: + variant.longValue = other.variant.longValue; + break; + + case Kind::Empty: + default: + break; + } + } + + inline + value::value(value&& other) noexcept + : kind(other.kind) + { + switch (kind) { + case Kind::String: + new (&variant.strValue) std::string(std::move(other.variant.strValue)); + break; + + case Kind::StringList: + new (&variant.strList) std::vector<std::string>(std::move(other.variant.strList)); + break; + + case Kind::Bool: + variant.boolValue = other.variant.boolValue; + break; + + case Kind::Long: + variant.longValue = other.variant.longValue; + break; + + case Kind::Empty: + default: + break; + } + } + + inline + value::~value() + { + switch (kind) { + case Kind::String: + variant.strValue.~basic_string(); + break; + + case Kind::StringList: + variant.strList.~vector(); + break; + + case Kind::Empty: + case Kind::Bool: + case Kind::Long: + default: + // trivial dtor + break; + } + } + + inline + value& value::operator=(value const& other) { + // make a copy and move from it; way easier. + return *this = value{other}; + } + + inline + value& value::operator=(value&& other) noexcept { + // move of all the types involved is noexcept, so we dont have to worry about + // these two statements throwing, which gives us a consistency guarantee. + this->~value(); + new (this) value(std::move(other)); + + return *this; + } + + template <class T> + void hash_combine(std::size_t& seed, const T& v); + + inline + size_t value::hash() const noexcept + { + switch (kind) { + case Kind::String: + return std::hash<std::string>()(variant.strValue); + + case Kind::StringList: { + size_t seed = std::hash<size_t>()(variant.strList.size()); + for(auto const& str : variant.strList) { + hash_combine(seed, str); + } + return seed; + } + + case Kind::Bool: + return std::hash<bool>()(variant.boolValue); + + case Kind::Long: + return std::hash<long>()(variant.longValue); + + case Kind::Empty: + default: + return std::hash<void*>()(nullptr); + } + } + + inline + bool value::asBool() const + { + throwIfNotKind(Kind::Bool); + return variant.boolValue; + } + + inline + long value::asLong() const + { + // Attempt to convert a string to a long + if (kind == Kind::String) { + const std::string& str = variant.strValue; + std::size_t pos; + const long ret = stol(str, &pos); // Throws if it can't convert + if (pos != str.length()) { + // The string ended in non-digits. + throw std::runtime_error( str + " contains non-numeric characters."); + } + return ret; + } + throwIfNotKind(Kind::Long); + return variant.longValue; + } + + inline + std::string const& value::asString() const + { + throwIfNotKind(Kind::String); + return variant.strValue; + } + + inline + std::vector<std::string> const& value::asStringList() const + { + throwIfNotKind(Kind::StringList); + return variant.strList; + } + + inline + bool operator==(value const& v1, value const& v2) + { + if (v1.kind != v2.kind) + return false; + + switch (v1.kind) { + case value::Kind::String: + return v1.variant.strValue==v2.variant.strValue; + + case value::Kind::StringList: + return v1.variant.strList==v2.variant.strList; + + case value::Kind::Bool: + return v1.variant.boolValue==v2.variant.boolValue; + + case value::Kind::Long: + return v1.variant.longValue==v2.variant.longValue; + + case value::Kind::Empty: + default: + return true; + } + } + + inline + bool operator!=(value const& v1, value const& v2) + { + return !(v1 == v2); + } +} + +#endif /* defined(docopt__value_h_) */ diff --git a/graph.cpp b/graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da7e97f678d9e53017f662d4e44adab50a0e0212 --- /dev/null +++ b/graph.cpp @@ -0,0 +1,198 @@ +// +// Created by Abderrahmane on 6/16/2018. +// +#include <algorithm> +#include <iostream> +#include <set> +#include <cmath> +#include "graph.h" +#include "hash.h" +#include "param.h" + + +namespace std { + + + + double compute_branch_edit_distance(Branch &Br1, Branch &Br2) { + double bed = 0; + double max_bed = 1+max(Br1.d_out, Br2.d_out)+max(Br1.d_in, Br2.d_in); + if (Br1.r != Br2.r) bed += 1; + bed += max(Br1.d_out, Br2.d_out); + bed += max(Br1.d_in, Br2.d_in); + if (max(Br1.d_out, Br2.d_out) == Br2.d_out) { + for (auto &a : Br1.es_out)bed -= min(a.second, Br2.es_out[a.first]); + } else { + for (auto &a : Br2.es_out)bed -= min(a.second, Br1.es_out[a.first]); + } + if (max(Br1.d_in, Br2.d_in) == Br2.d_in) { + for (auto &a : Br1.es_in)bed -= min(a.second, Br2.es_in[a.first]); + } else { + for (auto &a : Br2.es_in)bed -= min(a.second, Br1.es_in[a.first]); + } + return bed/max_bed; + } + + double euclidean_distance(const vector<double> &a,const vector<double> &b){ + double dist = 0; + for (int i=0;i<M;i++){ + dist += pow(abs(a[i]-b[i]),2); + } + return sqrt(dist); + } + + void update_BED(edge &e,uint32_t &branch_id,Branch &Br,unordered_map<uint32_t,vector<double>>& branches_distances, + vector<Branch> &prototype_branches,double &graph_size){ + + // read the edge + uint32_t src_id = get<F_S>(e); + uint32_t dst_id = get<F_D>(e); + string src_type = get<F_STYPE>(e); + string dst_type = get<F_DTYPE>(e); + string e_type = get<F_ETYPE>(e); + if (src_id==branch_id) // is a source branch + { + for (int i=0;i<M;i++){ + double max_bed= 1+max(Br.d_out, prototype_branches.at(i).d_out)+ + max(Br.d_in, prototype_branches.at(i).d_in); + double bed = branches_distances[branch_id].at(i)*max_bed; + bed = round(bed); + if (((Br.d_out+1) > prototype_branches.at(i).d_out) && + ( ( Br.es_out[e_type] +1) > prototype_branches.at(i).es_out[e_type] )){ + bed++; + } + else if (((Br.d_out+1) <= prototype_branches.at(i).d_out) && + ( ( Br.es_out[e_type] +1)<= prototype_branches.at(i).es_out[e_type] )){ + bed--; + } + + double new_max = 1+max(Br.d_out+1, prototype_branches.at(i).d_out)+ + max(Br.d_in, prototype_branches.at(i).d_in); + branches_distances[branch_id].at(i) =bed/new_max; + if ((bed / new_max)<0 || (bed / new_max)>1) exit(35); + } + // update the branch + Br.d_out++; + Br.es_out[e_type]++; + } + if (dst_id ==branch_id) // is a dst branch + { + for (int i=0;i<M;i++){ + double max_bed= 1+max(Br.d_out, prototype_branches.at(i).d_out)+ + max(Br.d_in, prototype_branches.at(i).d_in); + double bed = branches_distances[branch_id].at(i)*max_bed; + bed = round(bed); + if (((Br.d_in+1) > prototype_branches.at(i).d_in) && + ( ( Br.es_in[e_type] +1)> prototype_branches.at(i).es_in[e_type] )){ + bed++; + } + else if (((Br.d_in+1) <= prototype_branches.at(i).d_in) && + ( ( Br.es_in[e_type] +1)<= prototype_branches.at(i).es_in[e_type] )){ + bed--; + } + double new_max = 1+max(Br.d_out, prototype_branches.at(i).d_out)+ + max(Br.d_in+1, prototype_branches.at(i).d_in); + branches_distances[branch_id].at(i) =bed/new_max; + if ((bed / new_max)<0 || (bed / new_max)>1) exit(35); + } + // update the branch + Br.d_in++; + Br.es_in[e_type]++; + } + } + + + void update_graph_vector2 (uint32_t &gid,edge &e, + vector<Branch> &prototype_branches, + unordered_map<uint32_t, unordered_map<uint32_t, vector<double>>> &graphs_to_branches_distances, + unordered_map<uint32_t, unordered_map<uint32_t, Branch>> &graphs_to_branches, + unordered_map<uint32_t, graph_vector> &graphs_vectors, + unordered_map<uint32_t ,double > &graphs_sizes){ + + // read edge + uint32_t src_id = get<F_S>(e); + uint32_t dst_id = get<F_D>(e); + string src_type = get<F_STYPE>(e); + string dst_type = get<F_DTYPE>(e); + string e_type = get<F_ETYPE>(e); + + + // update src_id branch + graphs_to_branches[gid][src_id].r = src_type; + uint32_t d_out = graphs_to_branches[gid][src_id].d_out; + uint32_t d_in = graphs_to_branches[gid][src_id].d_in; + if ( (d_out+d_in)==0 ) { // a new branch + graphs_sizes[gid]+=1; + graphs_to_branches[gid][src_id].d_out=1; + graphs_to_branches[gid][src_id].es_out[e_type]=1; + int k = 0; + + for (auto &brp:prototype_branches) { + double bed = compute_branch_edit_distance(graphs_to_branches[gid][src_id], brp); + graphs_vectors[gid].at(k) *= 2*(graphs_sizes[gid]-1); + graphs_vectors[gid].at(k) += (1-bed); + graphs_vectors[gid].at(k)/=2*graphs_sizes[gid]; + graphs_to_branches_distances[gid][src_id].push_back(bed); + k++; + } + } + else{ // a old branch + + // substract the old impact of the branch src_id + for (int k=0;k<M;k++) { + graphs_vectors[gid].at(k) *= (2*graphs_sizes[gid]); + graphs_vectors[gid].at(k) -= (1-graphs_to_branches_distances[gid][src_id].at(k)) * (graphs_to_branches[gid][src_id].d_in + + graphs_to_branches[gid][src_id].d_out); + } + // update the BED's + graphs_sizes[gid]+=1; + update_BED(e,src_id,graphs_to_branches[gid][src_id],graphs_to_branches_distances[gid],prototype_branches,graphs_sizes[gid]); + + // add the new impact of the branch src_id + + for (int k=0;k<M;k++) { + graphs_vectors[gid].at(k) += (1- graphs_to_branches_distances[gid][src_id].at(k))*(graphs_to_branches[gid][src_id].d_in + + graphs_to_branches[gid][src_id].d_out); + graphs_vectors[gid].at(k) /= (2*graphs_sizes[gid]); + } + } + // update dst_id branch + graphs_to_branches[gid][dst_id].r = dst_type; + d_out = graphs_to_branches[gid][dst_id].d_out; + d_in = graphs_to_branches[gid][dst_id].d_in; + if ( (d_out+d_in)==0 ) { // a new branch + graphs_to_branches[gid][dst_id].d_in=1; + graphs_to_branches[gid][dst_id].es_in[e_type]=1; + int k = 0; + for (auto &brp:prototype_branches) { + double bed = compute_branch_edit_distance(graphs_to_branches[gid][dst_id], brp); + graphs_vectors[gid].at(k) *= 2*graphs_sizes[gid]; + graphs_vectors[gid].at(k) += (1-bed); + graphs_vectors[gid].at(k)/=2*graphs_sizes[gid]; + graphs_to_branches_distances[gid][dst_id].push_back(bed); + k++; + } + } + else{ // a old branch + // substract the old impact of the branch src_id + for (int k=0;k<M;k++) { + graphs_vectors[gid].at(k) *= (2*graphs_sizes[gid]); + graphs_vectors[gid].at(k) -= (1-graphs_to_branches_distances[gid][dst_id].at(k))*(graphs_to_branches[gid][dst_id].d_in + + graphs_to_branches[gid][dst_id].d_out); + } + // update the BED's + update_BED(e,dst_id,graphs_to_branches[gid][dst_id],graphs_to_branches_distances[gid],prototype_branches,graphs_sizes[gid]); + + // add the new impact of the branch src_id + for (int k=0;k<M;k++) { + graphs_vectors[gid].at(k) += (1- graphs_to_branches_distances[gid][dst_id].at(k))*(graphs_to_branches[gid][dst_id].d_in + + graphs_to_branches[gid][dst_id].d_out); + graphs_vectors[gid].at(k) /= (2*graphs_sizes[gid]); + } + } + } + + + + +} diff --git a/graph.h b/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..bddb6fc60c690801b55ce57ce8c58d12c1f96fde --- /dev/null +++ b/graph.h @@ -0,0 +1,52 @@ +// +// Created by Abderrahmane on 6/16/2018. +// + +#ifndef TESTMAP_GRAPH_H +#define TESTMAP_GRAPH_H + +#include <vector> +#include <tuple> +#include <unordered_map> + +namespace std { + +// edge field indices + #define F_S 0 // source node id + #define F_STYPE 1 // source node type + #define F_D 2 // destination node id + #define F_DTYPE 3 // destination node type + #define F_ETYPE 4 // edge type + #define F_GID 5 // graph id (tag) + +// data structures + typedef struct Branch{ + string r; // the root of the branch + unordered_map<string, int> es_out; // the edge structure of the outgoing edges from r + unordered_map<string, int> es_in; // the edge structure of the incoming edges to r + uint32_t d_out; // the number of outgoing edges + uint32_t d_in; // the number of incoming edges + } Branch; + + typedef tuple<uint32_t, string, uint32_t, string, string, uint32_t> edge; + typedef unordered_map<pair<uint32_t,string>, vector<tuple<uint32_t,string,string>>> graph; // + typedef vector<double> graph_vector; // vector representation of a graph + + + double compute_branch_edit_distance(Branch &Br1,Branch &Br2); + + double euclidean_distance(const vector<double> &a,const vector<double> &b); + + + void update_BED(edge &e,uint32_t &branch_id,Branch &Br,unordered_map<uint32_t,vector<double>>& branches_distances, + vector<Branch> &prototype_branches,double &graph_size); + void update_graph_vector2 (uint32_t &gid,edge &e, + vector<Branch> &prototype_branches, + unordered_map<uint32_t, unordered_map<uint32_t, vector<double>>> &graphs_to_branches_distances, + unordered_map<uint32_t, unordered_map<uint32_t, Branch>> &graphs_to_branches, + unordered_map<uint32_t, graph_vector> &graphs_vectors, + unordered_map<uint32_t ,double > &graphs_sizes); + +} + +#endif //TESTMAP_GRAPH_H diff --git a/hash.h b/hash.h new file mode 100644 index 0000000000000000000000000000000000000000..446f5793ade9bfdd254c9baf273c6e2435da2737 --- /dev/null +++ b/hash.h @@ -0,0 +1,38 @@ +// +// Created by Abderrahmane on 6/18/2018. +// + +#ifndef TESTMAP_HASH_H +#define TESTMAP_HASH_H + +#include <string> +#include <vector> + +namespace std { + + +/* Combination hash from Boost */ + template <class T> + inline void hash_combine(size_t& seed, const T& v) + { + hash<T> hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + + template<typename S, typename T> struct hash<pair<S, T>> + { + inline size_t operator()(const pair<S, T>& v) const + { + size_t seed = 0; + hash_combine(seed, v.first); + hash_combine(seed, v.second); + return seed; + } + }; +/* End combination hash from Boost */ + + + +} + +#endif //TESTMAP_HASH_H diff --git a/io.cpp b/io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..809b34bb29a593f420106b4ea50f22c5635c706c --- /dev/null +++ b/io.cpp @@ -0,0 +1,163 @@ + + +#include <fcntl.h> +#include <fstream> +#include "graph.h" +#include "io.h" +#include <iostream> +#include "param.h" +#include <string> +#include <sstream> +#include <tuple> +#include <unistd.h> +#include "util.h" +#include <vector> +#include <algorithm> +#include <unordered_set> + +namespace std { + + + tuple<unordered_map<uint32_t, graph_vector>, vector<uint32_t>> read_train_graph_vectors(string filename) { + + unordered_map<uint32_t, graph_vector> graph_vectors; + vector<uint32_t> train_gids; + ifstream f(filename); + string line; + + while (getline(f, line)) { + uint32_t gid; + double e; + stringstream ss; + ss.str(line); + ss >> gid; + train_gids.push_back(gid); + while (ss >> e) { + graph_vectors[gid].push_back(e); + } + } + return make_tuple(graph_vectors, train_gids); + } + + vector<Branch> read_prototype_branches(string filename) { + + vector<Branch> prototype_branches; + ifstream f(filename); + string line; + while (getline(f, line)) { + Branch br; + stringstream ss; + ss.str(line); + ss >> br.r; + ss >> br.d_out; + ss >> br.d_in; + getline(f, line); + if (br.d_out > 0) { + stringstream ss; + ss.str(line); + string c; + while (ss >> c) { + int f; + ss >> f; + if (f > 0) br.es_out[c] = f; + } + } + getline(f, line); + if (br.d_in > 0) { + stringstream ss; + ss.str(line); + string c; + int f; + while (ss >> c) { + ss >> f; + if (f > 0) br.es_in[c] = f; + } + } + prototype_branches.push_back(br); + } + + return prototype_branches; + } + + tuple<uint32_t, unordered_map<uint32_t, vector<edge>>> read_edges + (string filename, vector<uint32_t> &train_gids, unordered_set<uint32_t> scenarios) { + + + cerr << "Reading edges from: " << filename << endl; + vector<edge> train_edges; + unordered_map<uint32_t, vector<edge>> test_edges; + uint32_t num_train_edges = 0; + ifstream f(filename); + string line; + + + // read edges from the file + uint32_t i = 0; + uint32_t max_gid = 0; + + + while (getline(f, line)) { + string src_type, dst_type, e_type; + uint32_t src_id, dst_id, graph_id; + stringstream ss; + ss.str(line); + ss >> src_id; + ss >> src_type; + ss >> dst_id; + ss >> dst_type; + ss >> e_type; + ss >> graph_id; + if (graph_id > max_gid) { + max_gid = graph_id; + } + + i++; // skip newline + + + uint32_t scenario = graph_id / 100; + if (scenarios.find(scenario) != scenarios.end()) { + // add an edge to memory + if (find(train_gids.begin(), train_gids.end(), graph_id) == train_gids.end()) { // is a test edge + test_edges[graph_id].push_back(make_tuple(src_id, src_type, + dst_id, dst_type, + e_type, graph_id)); + + } + } + } + return make_tuple(max_gid + 1, test_edges); + } + + tuple<vector<vector<uint32_t>>, vector<double>, double> read_bootstrap_clusters(string bootstrap_file) { + int nclusters; + double global_threshold; + ifstream f(bootstrap_file); + string line; + stringstream ss; + + getline(f, line); + int num_graphs; + ss.str(line); + ss >> nclusters >> num_graphs >> global_threshold; + cout << nclusters << " , " << global_threshold << endl; + vector<double> cluster_thresholds(nclusters); + vector<vector<uint32_t>> clusters(nclusters); + + for (int i = 0; i < nclusters; i++) { + getline(f, line); + ss.clear(); + ss.str(line); + + double cluster_threshold; + ss >> cluster_threshold; + cluster_thresholds[i] = cluster_threshold; + + double gid; + while (ss >> gid) { + clusters[i].push_back((int) gid); + } + } + + return make_tuple(clusters, cluster_thresholds, global_threshold); + } +} \ No newline at end of file diff --git a/io.h b/io.h new file mode 100644 index 0000000000000000000000000000000000000000..34f71b4d84014c5cfd30182784387d23cabd4a9c --- /dev/null +++ b/io.h @@ -0,0 +1,22 @@ + +#ifndef STREAMSPOT_IO_H_ +#define STREAMSPOT_IO_H_ + +#include "graph.h" +#include <string> +#include <tuple> +#include <vector> +#include <unordered_set> + +namespace std { + + vector<Branch> read_prototype_branches(string filename); + tuple<unordered_map<uint32_t, graph_vector>,vector<uint32_t>> read_train_graph_vectors(string filename); + + tuple<uint32_t,unordered_map<uint32_t,vector<edge>>> read_edges (string filename,vector<uint32_t> &train_gids,unordered_set<uint32_t> scenarios); + + + tuple<vector<vector<uint32_t>>, vector<double>, double> read_bootstrap_clusters(string bootstrap_file); +} + +#endif diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03c4fb06b2ba9cc1b237b8e65befac8bf4053e9d --- /dev/null +++ b/main.cpp @@ -0,0 +1,338 @@ + + +#include <algorithm> +#include <bitset> +#include <cassert> +#include <deque> +#include <iostream> + +#include <string> +#include <unordered_map> +#include <set> +#include <unordered_set> +#include <vector> +#include <algorithm> +#include <sstream> +#include <random> +#include <chrono> +#include <fstream> + + +#include "docopt.h" +#include "graph.h" +#include "hash.h" +#include "io.h" +#include "param.h" +#include "cluster.h" + + +using namespace std; + +static const char USAGE[] = + R"( LEADS (Anomaly detection phase). + + Usage: + LEADS --edges=<edge file> + --prototypes=<prototype branches file> + --train=<train graphs file> + --clusters=<bootstrap cluster file> + --dataset=<dataset> + --B=<par> + --out=<output> + + LEADS (-h | --help) + + Options: + -h, --help Show this screen. + --edges=<edge file> Incoming stream of edges. + --prototypes=<prototype branches file> The prototype branhces + --train=<train graphs file> train graph vectors + --clusters=<bootstrap cluster file> bootstrap cluster file + --dataset=<dataset> 'ALL', 'YDC', 'GFC', 'YDG','AUTH,. + --B=<par> number of parallel graphs that arrive simulatenously +)"; +int M; +int main(int argc, char *argv[]) { + + // for measuring time + chrono::time_point<chrono::steady_clock> start; + chrono::time_point<chrono::steady_clock> end; + chrono::nanoseconds diff; + // for random + mt19937_64 prng(SEED); + + + // arguments + map<string, docopt::value> args = docopt::docopt(USAGE, {argv + 1, argv + argc}); + string edge_file(args["--edges"].asString()); + string prototypes_file(args["--prototypes"].asString()); + string train_vector_file(args["--train"].asString()); + string cluters_file (args["--clusters"].asString()); + string dataset(args["--dataset"].asString()); + string output_file(args["--out"].asString()); + uint32_t num_graphs; + + vector<uint32_t> train_gids; + unordered_map<uint32_t,vector<edge>> test_edges; + long par(args["--B"].asLong()); + + + + if (!(dataset.compare("ALL") == 0 || + dataset.compare("AUTH")== 0 || + dataset.compare("YDC") == 0 || + dataset.compare("YDG") == 0 || + dataset.compare("GFC") == 0)) { + cout << "Invalid dataset: " << dataset << ". "; + exit(-1); + } + unordered_set<uint32_t> scenarios; + if (dataset.compare("GFC") == 0) { + scenarios.insert(1); + scenarios.insert(2); + scenarios.insert(5); + scenarios.insert(3); // attack + } else if (dataset.compare("YDC") == 0) { + scenarios.insert(0); + scenarios.insert(4); + scenarios.insert(5); + scenarios.insert(3); // attack + }else if (dataset.compare("YDG")==0) { + scenarios.insert(0); + scenarios.insert(4); + scenarios.insert(1); + scenarios.insert(3); // attack + scenarios.insert(6); + }else if (dataset.compare("AUTH")==0){ + scenarios.insert(0); + scenarios.insert(1); + } else { // ALL + scenarios.insert(0); + scenarios.insert(1); + scenarios.insert(2); + scenarios.insert(3); // attack + scenarios.insert(4); + scenarios.insert(5); + scenarios.insert(6); + scenarios.insert(7); + } + + // the prototype branches + vector<Branch> prototype_branches; + prototype_branches = read_prototype_branches(prototypes_file); + M = prototype_branches.size(); + cout << "M = " << M << endl; + // read train graph vectors branches + unordered_map<uint32_t, graph_vector> graphs_vectors; // key = gid , value = the graph vector + tie (graphs_vectors,train_gids) = read_train_graph_vectors(train_vector_file); + // read bootstrap clusters and thresholds + vector<vector<uint32_t>> clusters; + vector<double> cluster_thresholds; + double global_threshold; + tie(clusters, cluster_thresholds, global_threshold) = read_bootstrap_clusters(cluters_file); + + // read the test edges + + tie(num_graphs,test_edges) = read_edges(edge_file, train_gids,scenarios); + ///num_graphs = 600; + + + + + // initialization of graphs vectors + for (int i=0;i<num_graphs;i++){ + if (find(train_gids.begin(),train_gids.end(),i)==train_gids.end()) { // is a test graph + for (int j=0;j<M;j++) graphs_vectors[i].push_back(0); + } + } + num_graphs = 700; + if (dataset.compare("AUTH")==0){ + num_graphs = 143; + } + vector<int> cluster_map(num_graphs, UNSEEN); // gid -> cluster id + + unordered_map<uint32_t, unordered_map<uint32_t, Branch>> graphs_to_branches ; + + unordered_map<uint32_t ,double> graphs_sizes; // key = gid ; value = the size of the graph + // initialization of graphs size + int gid = 0; + for (int i=0;i<num_graphs;i++){ + graphs_sizes[gid]=0; + gid++; + } + + // make groups of size par (parallel flowing graphs) + vector<uint32_t> test_gids; + for (uint32_t i = 0; i < num_graphs; i++) { + uint32_t scenario = i / 100; + if (scenarios.find(scenario) != scenarios.end()) { + if (find(train_gids.begin(), train_gids.end(), i) == train_gids.end()) { + test_gids.push_back(i); + } + } + } + shuffle(test_gids.begin(), test_gids.end(), prng); + vector<vector<uint32_t>> groups; + for (uint32_t i = 0; i < test_gids.size(); i += par) { + vector<uint32_t> group; + + uint32_t end_limit; + if (test_gids.size() - i < par) { + end_limit = test_gids.size() - i; + } else { + end_limit = par; + } + + for (uint32_t j = 0; j < end_limit; j++) { + group.push_back(test_gids[i + j]); + } + groups.push_back(group); + } + + /// create train graph centers + + vector<vector<double>> centroid_vectors; + // construct cluster centroid sketches/projections + cout << "Constructing bootstrap cluster centroids:" << endl; + centroid_vectors = construct_centroid_vectors(graphs_vectors,clusters); + cout << "end" << endl; + /// cluster map initialization + cout << clusters.size() << endl; + vector<uint32_t> cluster_sizes(clusters.size()); + for (uint32_t i = 0; i < clusters.size(); i++) { + cluster_sizes[i] = clusters[i].size(); + for (auto& gid : clusters[i]) { + cluster_map[gid] = i; + } + } + /// compute distances of training graphs to their cluster centroids + vector<double> anomaly_scores(num_graphs, UNSEEN); + for (auto& gid : train_gids) { + // anomaly score is a "distance" + anomaly_scores[gid] = euclidean_distance(graphs_vectors[gid],centroid_vectors[cluster_map[gid]]); + } + cout << "end anomaly initialization" << endl; + // add test edges to graphs + uint32_t num_test_edges = 0; + for (auto &me : test_edges) num_test_edges+= me.second.size(); + + cout << "Streaming in " << num_test_edges << " test edges:" << endl; + uint32_t num_intervals = ceil(static_cast<double>(num_test_edges) / + CLUSTER_UPDATE_INTERVAL); + if (num_intervals == 0) + num_intervals = 1; // if interval length is too long + vector<vector<double>> anomaly_score_iterations(num_intervals, + vector<double>(num_graphs)); + vector<vector<int>> cluster_map_iterations(num_intervals, + vector<int>(num_graphs)); + cout << "begin group loop" << endl; + uint32_t edge_num = 0; + vector<uint32_t> anoumalous_gids; + if (dataset.compare("AUTH")==0){ + anoumalous_gids.push_back(139); + anoumalous_gids.push_back(140); + anoumalous_gids.push_back(141); + anoumalous_gids.push_back(142); + } + else{ + for (uint32_t gid =300;gid<400;gid++) anoumalous_gids.push_back(gid); + for (uint32_t gid =600;gid<700;gid++) anoumalous_gids.push_back(gid); + } + + start = chrono::steady_clock::now(); + for (auto& group : groups) { + /// added + + unordered_map<uint32_t, unordered_map<uint32_t, vector<double>>> graphs_to_branches_distances; + + unordered_map<uint32_t,uint32_t> edge_offset; + for (auto &g : group) + edge_offset[g] = 0; + + vector<uint32_t> group_copy(group); + + while (group_copy.size() > 0) { + + // used to pick a random graph + uniform_int_distribution<uint32_t> rand_group_idx(0, group_copy.size() - 1); + + uint32_t gidx = rand_group_idx(prng); + uint32_t gid = group_copy[gidx]; + uint32_t off = edge_offset[gid]; + auto& e = test_edges[gid][off]; + if (get<F_GID>(e)!=gid ) exit(70); + // process edge + + auto old_gv = graphs_vectors[gid]; + + update_graph_vector2 (gid,e, + prototype_branches, + graphs_to_branches_distances, + graphs_to_branches, + graphs_vectors, + graphs_sizes); + + + // store current anomaly scores and cluster assignments + detect_anomalies(gid,graphs_vectors[gid],cluster_map,anomaly_scores,centroid_vectors,cluster_sizes,global_threshold,cluster_thresholds); + if ( edge_num % CLUSTER_UPDATE_INTERVAL ==0 || edge_num == num_test_edges-1 ){ + // cout << edge_num << endl; + anomaly_score_iterations[edge_num/CLUSTER_UPDATE_INTERVAL] = anomaly_scores; + cluster_map_iterations[edge_num/CLUSTER_UPDATE_INTERVAL] = cluster_map; + } + edge_num++; + edge_offset[gid]++; // increment next edge offset + if (edge_offset[gid] == test_edges[gid].size()) { + // out of edges for this gid + group_copy.erase(group_copy.begin() + gidx); + graphs_to_branches[gid].clear(); + test_edges[gid].clear(); + } + } + int TP = 0; + int FP=0; + int FN=0; + int TN = 0; + for (uint32_t j = 0; j < num_graphs ; j++) { + int k = cluster_map_iterations[edge_num / CLUSTER_UPDATE_INTERVAL][j]; + if ( find(anoumalous_gids.begin(),anoumalous_gids.end(),j)!=anoumalous_gids.end()) { + // cout << cluster_map_iterations[edge_num / CLUSTER_UPDATE_INTERVAL][113] << endl; + if ( k== ANOMALY && k!=UNSEEN) TP++; + if ( k!=UNSEEN && k!=ANOMALY) FN++; + } + else { + if ( k!=UNSEEN && k!=ANOMALY) TN++; + if (k==ANOMALY) FP++; + } + } + cout << "scores : " << TP <<" "<< FP << " " << TN << " "<< FN <<" " << endl; + for (auto& item:graphs_to_branches_distances) { + for (auto& item1:item.second) vector<double>().swap(item1.second); + } + graphs_to_branches_distances.clear(); + } + end = chrono::steady_clock::now(); + diff = chrono::duration_cast<chrono::nanoseconds>(end - start); + cout << static_cast<double>(diff.count()) << "us" << endl; + + ofstream out_file; + out_file.open(output_file); + + // cout << "Iterations " << num_intervals << endl; + for (uint32_t i = 0; i < num_intervals; i++) { + auto& a = anomaly_score_iterations[i]; + auto& c = cluster_map_iterations[i]; + for (uint32_t j = 0; j < a.size(); j++) { + out_file << a[j] << ","; + } + out_file << endl; + for (uint32_t j = 0; j < a.size(); j++) { + out_file << c[j] << ","; + } + out_file << endl; + } + out_file << "execution time : " << static_cast<double>(diff.count()) << "ns" << endl; + cout << "execution time : " << static_cast<double>(diff.count()) << "ns" << endl; + + return 0; +} diff --git a/param.h b/param.h new file mode 100644 index 0000000000000000000000000000000000000000..52985742627f53a2d3e981d906d3f4d014e20943 --- /dev/null +++ b/param.h @@ -0,0 +1,24 @@ + +#ifndef STREAMSPOT_PARAM_H_ +#define STREAMSPOT_PARAM_H_ + +#ifdef DEBUG +#define NDEBUG 0 +#endif + +extern int M; +#define B 100 +#define R 20 +#define BUF_SIZE 50 +#define DELIMITER '\t' +#define L 1000 // must be = B * R +#define SEED 23 +#define CLUSTER_UPDATE_INTERVAL 100 +#define INF 5000000 +#define UNSEEN -2 +#define ANOMALY -1 + + +#define PI 3.1415926535897 + +#endif diff --git a/util.h b/util.h new file mode 100644 index 0000000000000000000000000000000000000000..a779a8d038f8aaee917f1f3dabe7ad5c4727c31e --- /dev/null +++ b/util.h @@ -0,0 +1,23 @@ +/* + * Copyright 2016 Emaad Ahmed Manzoor + * License: Apache License, Version 2.0 + * http://www3.cs.stonybrook.edu/~emanzoor/streamspot/ + */ + +#ifndef STREAMSPOT_UTIL_H_ +#define STREAMSPOT_UTIL_H_ + +#include <string> +#include <iostream> + +namespace std { + +inline void panic(string message) { + cout << message << endl; + exit(-1); +} + + +} + +#endif