diff -Nru volk-1.3/apps/CMakeLists.txt volk-1.4/apps/CMakeLists.txt --- volk-1.3/apps/CMakeLists.txt 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/apps/CMakeLists.txt 2018-03-26 22:52:55.000000000 +0000 @@ -36,6 +36,7 @@ add_executable(volk_profile ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc ${PROJECT_SOURCE_DIR}/lib/qa_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc ) @@ -53,13 +54,14 @@ ) # MAKE volk-config-info -add_executable(volk-config-info volk-config-info.cc) +add_executable(volk-config-info volk-config-info.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc + ) if(ENABLE_STATIC_LIBS) - target_link_libraries(volk-config-info volk_static ${Boost_LIBRARIES}) + target_link_libraries(volk-config-info volk_static) set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static") else() - target_link_libraries(volk-config-info volk ${Boost_LIBRARIES}) + target_link_libraries(volk-config-info volk) endif() install( diff -Nru volk-1.3/apps/volk-config-info.cc volk-1.4/apps/volk-config-info.cc --- volk-1.3/apps/volk-config-info.cc 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/apps/volk-config-info.cc 2018-03-26 22:52:55.000000000 +0000 @@ -1,6 +1,6 @@ /* -*- c++ -*- */ /* - * Copyright 2013, 2016 Free Software Foundation, Inc. + * Copyright 2013, 2016, 2018 Free Software Foundation, Inc. * * This file is part of GNU Radio * @@ -24,88 +24,52 @@ #include #endif -#include -#include "volk/volk.h" -#include -#include +#include // for volk_available_machines, volk_c_com... 
+#include // for operator<<, endl, cout, ostream +#include // for string + +#include "volk/volk.h" // for volk_get_alignment, volk_get_machine +#include "volk_option_helpers.h" // for option_list, option_t + +void print_alignment() +{ + std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; +} + +void print_malloc() +{ + // You don't want to change the volk_malloc code, so just copy the if/else + // structure from there and give an explanation for the implementations + std::cout << "Used malloc implementation: "; + #if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN + std::cout << "posix_memalign" << std::endl; + #elif _MSC_VER >= 1400 + std::cout << "aligned_malloc" << std::endl; + #else + std::cout << "No standard handler available, using own implementation." << std::endl; + #endif +} -namespace po = boost::program_options; int main(int argc, char **argv) { - po::options_description desc("Program options: volk-config-info [options]"); - po::variables_map vm; - desc.add_options() - ("help,h", "print help message") - ("prefix", "print VOLK installation prefix") - ("cc", "print VOLK C compiler version") - ("cflags", "print VOLK CFLAGS") - ("all-machines", "print VOLK machines built into library") - ("avail-machines", "print VOLK machines the current platform can use") - ("machine", "print the VOLK machine that will be used") - ("alignment", "print the alignment that will be used") - ("malloc", "print malloc implementation that will be used") - ("version,v", "print VOLK version") - ; - - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - } - catch (po::error& error){ - std::cerr << "Error: " << error.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return 1; - } - - if(vm.size() == 0 || vm.count("help")) { - std::cout << desc << std::endl; - return 1; - } - - if(vm.count("prefix")) - std::cout << volk_prefix() << std::endl; - - if(vm.count("version")) - std::cout << 
volk_version() << std::endl; - - if(vm.count("cc")) - std::cout << volk_c_compiler() << std::endl; - - if(vm.count("cflags")) - std::cout << volk_compiler_flags() << std::endl; - - // stick an extra ';' to make output of this and avail-machines the - // same structure for easier parsing - if(vm.count("all-machines")) - std::cout << volk_available_machines() << ";" << std::endl; - - if(vm.count("avail-machines")) { - volk_list_machines(); - } - - if(vm.count("machine")) { - std::cout << volk_get_machine() << std::endl; - } - - if(vm.count("alignment")) { - std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; - } + option_list our_options("volk-config-info"); + our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); + our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); + our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); + our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines())); + our_options.add(option_t("avail-machines", "", "print VOLK machines on the current " + "platform", volk_list_machines)); + our_options.add(option_t("machine", "", "print the current VOLK machine that will be used", + volk_get_machine())); + our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); + our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc", + print_malloc)); + our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); - // You don't want to change the volk_malloc code, so just copy the if/else - // structure from there and give an explanation for the implementations - if(vm.count("malloc")) { - std::cout << "Used malloc implementation: "; -#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN - std::cout << "posix_memalign" << std::endl; -#elif _MSC_VER >= 1400 - std::cout << 
"aligned_malloc" << std::endl; -#else - std::cout << "No standard handler available, using own implementation." << std::endl; -#endif - } + our_options.parse(argc, argv); return 0; } diff -Nru volk-1.3/apps/volk_option_helpers.cc volk-1.4/apps/volk_option_helpers.cc --- volk-1.3/apps/volk_option_helpers.cc 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/apps/volk_option_helpers.cc 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,187 @@ +// +// Created by nathan on 2/1/18. +// + +#include "volk_option_helpers.h" + +#include // for exception +#include // for operator<<, endl, basic_ostream, cout, ostream +#include // for pair +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep + +/* + * Option type + */ +option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback(callback) { option_type = VOID_CALLBACK; } + +option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)()) callback) { option_type = INT_CALLBACK; } + +option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; } + +option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)()) callback) { option_type = BOOL_CALLBACK; } + +option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)()) callback) { option_type = STRING_CALLBACK; } + 
+option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + printval(printval) { option_type = STRING; } + + +/* + * Option List + */ + +option_list::option_list(std::string program_name) : + program_name(program_name) { + internal_list = std::vector(); +} + + +void option_list::add(option_t opt) { internal_list.push_back(opt); } + +void option_list::parse(int argc, char **argv) { + for (int arg_number = 0; arg_number < argc; ++arg_number) { + for (std::vector::iterator this_option = internal_list.begin(); + this_option != internal_list.end(); + this_option++) { + int int_val = INT_MIN; + if (this_option->longform == std::string(argv[arg_number]) || + this_option->shortform == std::string(argv[arg_number])) { + + if (present_options.count(this_option->longform) == 0) { + present_options.insert(std::pair(this_option->longform, 1)); + } else { + present_options[this_option->longform] += 1; + } + switch (this_option->option_type) { + case VOID_CALLBACK: + this_option->callback(); + break; + case INT_CALLBACK: + try { + int_val = atoi(argv[++arg_number]); + ((void (*)(int)) this_option->callback)(int_val); + } catch (std::exception &exc) { + std::cout << "An int option can only receive a number" << std::endl; + throw std::exception(); + }; + break; + case FLOAT_CALLBACK: + try { + double double_val = atof(argv[++arg_number]); + ((void (*)(float)) this_option->callback)(double_val); + } catch (std::exception &exc) { + std::cout << "A float option can only receive a number" << std::endl; + throw std::exception(); + }; + break; + case BOOL_CALLBACK: + try { + if (arg_number == (argc - 1)) { // this is the last arg + int_val = 1; + } else { // sneak a look at the next arg since it's present + char *next_arg = argv[arg_number + 1]; + if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) { + // the next arg is actually an arg, the bool 
is just present, set to true + int_val = 1; + } else if (strncmp(next_arg, "true", 4) == 0) { + int_val = 1; + } else if (strncmp(next_arg, "false", 5) == 0) { + int_val = 0; + } else { + // we got a number or a string. + // convert it to a number and depend on the catch to report an error condition + int_val = (bool) atoi(argv[++arg_number]); + } + } + } catch (std::exception &e) { + int_val = INT_MIN; + }; + if (int_val == INT_MIN) { + std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean " + "options should receive one of '0', '1', 'true', 'false'." << std::endl; + throw std::exception(); + } else if (int_val) { + ((void (*)(bool)) this_option->callback)(int_val); + } + break; + case STRING_CALLBACK: + try { + ((void (*)(std::string)) this_option->callback)(argv[++arg_number]); + } catch (std::exception &exc) { + throw std::exception(); + }; + case STRING: + std::cout << this_option->printval << std::endl; + break; + } + } + + } + if (std::string("--help") == std::string(argv[arg_number]) || + std::string("-h") == std::string(argv[arg_number])) { + present_options.insert(std::pair("--help", 1)); + help(); + } + } +} + +bool option_list::present(std::string option_name) { + if (present_options.count("--" + option_name)) { + return true; + } else { + return false; + } +} + +void option_list::help() { + std::cout << program_name << std::endl; + std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl; + for (std::vector::iterator this_option = internal_list.begin(); + this_option != internal_list.end(); + this_option++) { + std::string help_line(" "); + if (this_option->shortform == "-") { + help_line += this_option->longform + " "; + } else { + help_line += this_option->shortform + " [ " + this_option->longform + " ]"; + } + + switch (help_line.size() / 8) { + case 0: + help_line += "\t"; + case 1: + help_line += "\t"; + case 2: + help_line += "\t"; + case 3: + help_line += "\t"; + } + help_line += 
this_option->msg; + std::cout << help_line << std::endl; + } +} diff -Nru volk-1.3/apps/volk_option_helpers.h volk-1.4/apps/volk_option_helpers.h --- volk-1.3/apps/volk_option_helpers.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/apps/volk_option_helpers.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,60 @@ +// +// Created by nathan on 2/1/18. +// + +#ifndef VOLK_VOLK_OPTION_HELPERS_H +#define VOLK_VOLK_OPTION_HELPERS_H + +#include +#include +#include +#include +#include + +typedef enum +{ + VOID_CALLBACK, + INT_CALLBACK, + BOOL_CALLBACK, + STRING_CALLBACK, + FLOAT_CALLBACK, + STRING, +} VOLK_OPTYPE; + +class option_t { + public: + option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); + option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); + option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)); + option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)); + option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)); + option_t(std::string longform, std::string shortform, std::string msg, std::string printval); + + std::string longform; + std::string shortform; + std::string msg; + VOLK_OPTYPE option_type; + std::string printval; + void (*callback)(); + +}; + +class option_list +{ + public: + option_list(std::string program_name); + bool present(std::string option_name); + + void add(option_t opt); + + void parse(int argc, char **argv); + + void help(); + private: + std::string program_name; + std::vector internal_list; + std::map present_options; +}; + + +#endif //VOLK_VOLK_OPTION_HELPERS_H diff -Nru volk-1.3/apps/volk_profile.cc volk-1.4/apps/volk_profile.cc --- volk-1.3/apps/volk_profile.cc 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/apps/volk_profile.cc 2018-03-26 22:52:55.000000000 +0000 @@ -20,166 +20,96 @@ * Boston, MA 02110-1301, USA. 
*/ -#include "qa_utils.h" -#include "kernel_tests.h" +#include // for create_directories, exists +#include // for path, operator<< +#include // for filesystem +#include // for size_t +#include // for stat +#include // for volk_get_config_path +#include // for operator<<, basic_ostream +#include // IWYU pragma: keep +#include // for map, map<>::iterator +#include // for pair +#include // for vector, vector<>::const_... + +#include "kernel_tests.h" // for init_test_list +#include "qa_utils.h" // for volk_test_results_t, vol... +#include "volk/volk_complex.h" // for lv_32fc_t +#include "volk_option_helpers.h" // for option_list, option_t #include "volk_profile.h" -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace fs = boost::filesystem; +volk_test_params_t test_params(1e-6f, 327.f, 131071, 1987, false, ""); + +void set_benchmark(bool val) { test_params.set_benchmark(val); } +void set_tolerance(float val) { test_params.set_tol(val); } +void set_vlen(int val) { test_params.set_vlen((unsigned int)val); } +void set_iter(int val) { test_params.set_iter((unsigned int)val); } +void set_substr(std::string val) { test_params.set_regex(val); } +bool update_mode = false; +void set_update(bool val) { update_mode = val; } +bool dry_run = false; +void set_dryrun(bool val) { dry_run = val; } +std::string json_filename(""); +void set_json(std::string val) { json_filename = val; } +std::string volk_config_path(""); +void set_volk_config(std::string val) { volk_config_path = val; } + int main(int argc, char *argv[]) { + + option_list profile_options("volk_profile"); + profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); + profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); + profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); + profile_options.add((option_t("iter", "i", "Set 
the default number of test iterations per kernel", set_iter))); + profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr))); + profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update))); + profile_options.add((option_t("dry-run", "n", "Dry run. Respect other options, but don't write to file", set_dryrun))); + profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json))); + profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config))); + profile_options.parse(argc, argv); + + if (profile_options.present("help")) { + return 0; + } + + if(dry_run) { + std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl; + } + // Adding program options - boost::program_options::options_description desc("Options"); - desc.add_options() - ("help,h", "Print help messages") - ("benchmark,b", - boost::program_options::value()->default_value( false ) - ->implicit_value( true ), - "Run all kernels (benchmark mode)") - ("tol,t", - boost::program_options::value()->default_value( 1e-6 ), - "Set the default error tolerance for tests") - ("vlen,v", - boost::program_options::value()->default_value( 131071 ), - "Set the default vector length for tests") // default is a mersenne prime - ("iter,i", - boost::program_options::value()->default_value( 1987 ), - "Set the default number of test iterations per kernel") - ("tests-regex,R", - boost::program_options::value(), - "Run tests matching regular expression.") - ("update,u", - boost::program_options::value()->default_value( false ) - ->implicit_value( true ), - "Run only kernels missing from config; use -R to further restrict the candidates") - ("dry-run,n", - boost::program_options::value()->default_value( false ) - ->implicit_value( true ), - "Dry run. 
Respect other options, but don't write to file") - ("json,j", - boost::program_options::value(), - "JSON output file") - ("path,p", - boost::program_options::value(), - "Specify volk_config path.") - ; - - // Handle the options that were given - boost::program_options::variables_map vm; - bool benchmark_mode; - std::string kernel_regex; std::ofstream json_file; - float def_tol; - lv_32fc_t def_scalar; - int def_iter; - int def_vlen; - bool def_benchmark_mode; - std::string def_kernel_regex; - bool update_mode = false; - bool dry_run = false; std::string config_file; - // Handle the provided options - try { - boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); - boost::program_options::notify(vm); - benchmark_mode = vm.count("benchmark")?vm["benchmark"].as():false; - if ( vm.count("tests-regex" ) ) { - kernel_regex = vm["tests-regex"].as(); - } - else { - kernel_regex = ".*"; - } - - def_tol = vm["tol"].as(); - def_scalar = 327.0; - def_vlen = vm["vlen"].as(); - def_iter = vm["iter"].as(); - def_benchmark_mode = benchmark_mode; - def_kernel_regex = kernel_regex; - update_mode = vm["update"].as(); - dry_run = vm["dry-run"].as(); - } - catch (boost::program_options::error& error) { - std::cerr << "Error: " << error.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return 1; - } - - /** --help option */ - if ( vm.count("help") ) { - std::cout << "The VOLK profiler." 
<< std::endl - << desc << std::endl; - return 0; - } - - if ( vm.count("json") ) { - std::string filename; - try { - filename = vm["json"].as(); - } - catch (boost::bad_any_cast& error) { - std::cerr << error.what() << std::endl; - return 1; - } - json_file.open( filename.c_str() ); + if ( json_filename != "" ) { + json_file.open( json_filename.c_str() ); } - if ( vm.count("path") ) { - try { - config_file = vm["path"].as() + "/volk_config"; - } - catch (boost::bad_any_cast& error) { - std::cerr << error.what() << std::endl; - return 1; - } + if ( volk_config_path != "" ) { + config_file = volk_config_path + "/volk_config"; } - volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter, - def_benchmark_mode, def_kernel_regex); - // Run tests std::vector results; if(update_mode) { - if( vm.count("path") ) read_results(&results, config_file); + if( config_file != "" ) read_results(&results, config_file); else read_results(&results); } - // Initialize the list of tests - // the default test parameters come from options std::vector test_cases = init_test_list(test_params); - boost::xpressive::sregex kernel_expression; - try { - kernel_expression = boost::xpressive::sregex::compile(kernel_regex); - } - catch (boost::xpressive::regex_error& error) { - std::cerr << "Error occured while compiling regex" << std::endl << std::endl; - return 1; - } - // Iteratate through list of tests running each one + // Iterate through list of tests running each one + std::string substr_to_match(test_params.kernel_regex()); for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { bool regex_match = true; volk_test_case_t test_case = test_cases[ii]; // if the kernel name matches regex then do the test - if(boost::xpressive::regex_search(test_case.name(), kernel_expression)) { - regex_match = true; - } - else { + std::string test_case_name = test_case.name(); + if(test_case_name.find(substr_to_match) == std::string::npos) { regex_match = false; } @@ -201,22 +131,21 @@ 
run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), test_case.test_parameters(), &results, test_case.puppet_master_name()); } - catch (std::string error) { + catch (std::string &error) { std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl; } - } } // Output results according to provided options - if(vm.count("json")) { + if(json_filename != "") { write_json(json_file, results); json_file.close(); } if(!dry_run) { - if(vm.count("path")) write_results(&results, false, config_file); + if(config_file != "") write_results(&results, false, config_file); else write_results(&results, false); } else { @@ -234,11 +163,12 @@ void read_results(std::vector *results, std::string path) { - const fs::path config_path(path); + struct stat buffer; + bool config_status = (stat (path.c_str(), &buffer) == 0); - if(fs::exists(config_path)) { + if( config_status ) { // a config exists and we are reading results from it - std::ifstream config(config_path.string().c_str()); + std::ifstream config(path.c_str()); char config_line[256]; while(config.getline(config_line, 255)) { // tokenize the input line by kernel_name unaligned aligned @@ -249,10 +179,10 @@ std::size_t str_size = config_str.size(); std::size_t found = 1; - found = config_str.find(" "); + found = config_str.find(' '); // Split line by spaces while(found && found < str_size) { - found = config_str.find(" "); + found = config_str.find(' '); // kernel names MUST be less than 128 chars, which is // a length restricted by volk/volk_prefs.c // on the last token in the parsed string we won't find a space @@ -261,7 +191,7 @@ found = 127; } str_size = config_str.size(); - char buffer[128]; + char buffer[128] = {'\0'}; config_str.copy(buffer, found + 1, 0); buffer[found] = '\0'; single_kernel_result.push_back(std::string(buffer)); @@ -278,7 +208,6 @@ } } } - } void write_results(const std::vector *results, bool update_result) @@ -291,11 +220,14 @@ void write_results(const std::vector 
*results, bool update_result, const std::string path) { - const fs::path config_path(path); +// struct stat buffer; +// bool config_status = (stat (path.c_str(), &buffer) == 0); - // Until we can update the config on a kernel by kernel basis - // do not overwrite volk_config when using a regex. - if (not fs::exists(config_path.branch_path())) + /* + * These + */ + const fs::path config_path(path); + if (! fs::exists(config_path.branch_path())) { std::cout << "Creating " << config_path.branch_path() << "..." << std::endl; fs::create_directories(config_path.branch_path()); @@ -303,17 +235,17 @@ std::ofstream config; if(update_result) { - std::cout << "Updating " << config_path << "..." << std::endl; - config.open(config_path.string().c_str(), std::ofstream::app); + std::cout << "Updating " << path << "..." << std::endl; + config.open(path.c_str(), std::ofstream::app); if (!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::cout << "Error opening file " << config_path << std::endl; + std::cout << "Error opening file " << path << std::endl; } } else { - std::cout << "Writing " << config_path << "..." << std::endl; - config.open(config_path.string().c_str()); + std::cout << "Writing " << path << "..." 
<< std::endl; + config.open(path.c_str()); if (!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::cout << "Error opening file " << config_path << std::endl; + std::cout << "Error opening file " << path << std::endl; } config << "\ diff -Nru volk-1.3/apps/volk_profile.h volk-1.4/apps/volk_profile.h --- volk-1.3/apps/volk_profile.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/apps/volk_profile.h 2018-03-26 22:52:55.000000000 +0000 @@ -1,5 +1,12 @@ +#include // for bool +#include // for ofstream +#include // for string +#include // for vector + +class volk_test_results_t; + void read_results(std::vector *results); void read_results(std::vector *results, std::string path); void write_results(const std::vector *results, bool update_result); diff -Nru volk-1.3/appveyor.yml volk-1.4/appveyor.yml --- volk-1.3/appveyor.yml 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/appveyor.yml 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,55 @@ +clone_depth: 1 + +os: Visual Studio 2013 + +install: + - echo "Installing Boost libraries..." + - nuget install boost_system-vc120 + - nuget install boost_filesystem-vc120 + - nuget install boost_chrono-vc120 + - nuget install boost_program_options-vc120 + - nuget install boost_unit_test_framework-vc120 + + - echo "Installing Cheetah templates..." 
+ - appveyor DownloadFile https://pypi.python.org/packages/source/C/Cheetah/Cheetah-2.4.4.tar.gz + - 7z x Cheetah-2.4.4.tar.gz + - 7z x -y Cheetah-2.4.4.tar + - cd Cheetah-2.4.4 + - c:\Python27\python.exe setup.py build + - c:\Python27\python.exe setup.py install + +build_script: + - cd c:\projects\volk + + # Without this directory in the %PATH%, compiler tests fail because of missing DLLs + - set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin + + - cmake -G "Visual Studio 12 Win64" \ + -DBoost_CHRONO_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_chrono-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_chrono-vc120-mt-1_59.lib \ + -DBoost_FILESYSTEM_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_filesystem-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_filesystem-vc120-mt-1_59.lib \ + -DBoost_PROGRAM_OPTIONS_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_program_options-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_program_options-vc120-mt-1_59.lib \ + -DBoost_SYSTEM_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_system-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_system-vc120-mt-1_59.lib \ + -DBoost_UNIT_TEST_FRAMEWORK_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_unit_test_framework-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_unit_test_framework-vc120-mt-1_59.lib \ + -DBoost_INCLUDE_DIR:PATH=c:/projects/volk/boost.1.59.0.0/lib/native/include \ + -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=OFF \ + . + + - cmake --build . 
--config Release --target INSTALL + + # Create an archive + - cd "c:\Program Files" + - 7z a "c:\libvolk-x64.zip" volk + + # Create the deps archive + - mkdir dlls + - copy c:\projects\volk\boost_chrono-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_chrono-vc120-mt-1_59.dll dlls\boost_chrono-vc120-mt-1_59.dll + - copy c:\projects\volk\boost_filesystem-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_filesystem-vc120-mt-1_59.dll dlls\boost_filesystem-vc120-mt-1_59.dll + - copy c:\projects\volk\boost_program_options-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_program_options-vc120-mt-1_59.dll dlls\boost_program_options-vc120-mt-1_59.dll + - copy c:\projects\volk\boost_system-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_system-vc120-mt-1_59.dll dlls\boost_system-vc120-mt-1_59.dll + - copy c:\projects\volk\boost_unit_test_framework-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_unit_test_framework-vc120-mt-1_59.dll dlls\boost_unit_test_framework-vc120-mt-1_59.dll + - cd dlls + - 7z a "c:\libvolk-x64-deps.zip" * + + # Push it! + - appveyor PushArtifact c:\libvolk-x64.zip + - appveyor PushArtifact c:\libvolk-x64-deps.zip diff -Nru volk-1.3/cmake/Modules/CMakeParseArgumentsCopy.cmake volk-1.4/cmake/Modules/CMakeParseArgumentsCopy.cmake --- volk-1.3/cmake/Modules/CMakeParseArgumentsCopy.cmake 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/Modules/CMakeParseArgumentsCopy.cmake 2018-03-26 22:52:55.000000000 +0000 @@ -58,7 +58,7 @@ # the new option. # E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would result in # MY_INSTALL_DESTINATION set to "OPTIONAL", but MY_INSTALL_DESTINATION would -# be empty and MY_INSTALL_OPTIONAL would be set to TRUE therefor. +# be empty and MY_INSTALL_OPTIONAL would be set to TRUE therefore. 
#============================================================================= # Copyright 2010 Alexander Neundorf diff -Nru volk-1.3/cmake/Modules/VolkAddTest.cmake volk-1.4/cmake/Modules/VolkAddTest.cmake --- volk-1.3/cmake/Modules/VolkAddTest.cmake 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/Modules/VolkAddTest.cmake 2018-03-26 22:52:55.000000000 +0000 @@ -23,23 +23,39 @@ set(__INCLUDED_VOLK_ADD_TEST TRUE) ######################################################################## +# Generate a test executable which can be used in ADD_TEST to call +# various subtests. +# +# SOURCES - sources for the test +# TARGET_DEPS - build target dependencies (e.g., libraries) +######################################################################## + +function(VOLK_GEN_TEST executable_name) + include(CMakeParseArgumentsCopy) + CMAKE_PARSE_ARGUMENTS(VOLK_TEST "" "" "SOURCES;TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN}) + add_executable(${executable_name} ${VOLK_TEST_SOURCES}) + target_link_libraries(${executable_name} ${VOLK_TEST_TARGET_DEPS}) +endfunction() + +######################################################################## # Add a unit test and setup the environment for it. # Encloses ADD_TEST, with additional functionality to create a shell # script that sets the environment to gain access to in-build binaries # properly. The following variables are used to pass in settings: +# A test executable has to be generated with VOLK_GEN_TEST beforehand. +# The executable name has to be passed as argument. 
# # NAME - the test name -# SOURCES - sources for the test # TARGET_DEPS - build target dependencies (e.g., libraries) # EXTRA_LIB_DIRS - other directories for the library path # ENVIRONS - other environment key/value pairs # ARGS - arguments for the test ######################################################################## -function(VOLK_ADD_TEST test_name) +function(VOLK_ADD_TEST test_name executable_name) #parse the arguments for component names include(CMakeParseArgumentsCopy) - CMAKE_PARSE_ARGUMENTS(VOLK_TEST "" "" "SOURCES;TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN}) + CMAKE_PARSE_ARGUMENTS(VOLK_TEST "" "" "TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN}) #set the initial environs to use set(environs ${VOLK_TEST_ENVIRONS}) @@ -65,7 +81,7 @@ #"add_test" command, via the $ operator; make sure the #test's directory is first, since it ($1) is prepended to PATH. unset(TARGET_DIR_LIST) - foreach(target ${test_name} ${VOLK_TEST_TARGET_DEPS}) + foreach(target ${executable_name} ${VOLK_TEST_TARGET_DEPS}) list(APPEND TARGET_DIR_LIST "\$") endforeach() @@ -134,18 +150,17 @@ file(APPEND ${sh_file} "export ${environ}\n") endforeach(environ) + set(VOLK_TEST_ARGS "${test_name}") + #redo the test args to have a space between each string(REPLACE ";" " " VOLK_TEST_ARGS "${VOLK_TEST_ARGS}") #finally: append the test name to execute - file(APPEND ${sh_file} ${test_name} " " ${VOLK_TEST_ARGS} "\n") + file(APPEND ${sh_file} "${CMAKE_CROSSCOMPILING_EMULATOR} ${executable_name} ${VOLK_TEST_ARGS}\n") #make the shell file executable execute_process(COMMAND chmod +x ${sh_file}) - add_executable(${test_name} ${VOLK_TEST_SOURCES}) - target_link_libraries(${test_name} ${VOLK_TEST_TARGET_DEPS}) - #add the shell file as the test to execute; #use the form that allows for $ substitutions, #then combine the script arguments inside the script. 
@@ -196,9 +211,6 @@ file(APPEND ${bat_file} ${test_name} " " ${VOLK_TEST_ARGS} "\n") file(APPEND ${bat_file} "\n") - add_executable(${test_name} ${VOLK_TEST_SOURCES}) - target_link_libraries(${test_name} ${VOLK_TEST_TARGET_DEPS}) - add_test(${test_name} ${bat_file}) endif(WIN32) diff -Nru volk-1.3/cmake/Modules/VolkBoost.cmake volk-1.4/cmake/Modules/VolkBoost.cmake --- volk-1.3/cmake/Modules/VolkBoost.cmake 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/Modules/VolkBoost.cmake 2018-03-26 22:52:55.000000000 +0000 @@ -29,8 +29,6 @@ set(BOOST_REQUIRED_COMPONENTS filesystem system - unit_test_framework - program_options ) if(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64") diff -Nru volk-1.3/cmake/Modules/VolkBuildTypes.cmake volk-1.4/cmake/Modules/VolkBuildTypes.cmake --- volk-1.3/cmake/Modules/VolkBuildTypes.cmake 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/Modules/VolkBuildTypes.cmake 2018-03-26 22:52:55.000000000 +0000 @@ -29,7 +29,7 @@ # - RelWithDebInfo: -O3 -g # - MinSizeRel: -Os -# Addtional Build Types, defined below: +# Additional Build Types, defined below: # - NoOptWithASM: -O0 -g -save-temps # - O2WithASM: -O2 -g -save-temps # - O3WithASM: -O3 -g -save-temps diff -Nru volk-1.3/cmake/Modules/VolkPython.cmake volk-1.4/cmake/Modules/VolkPython.cmake --- volk-1.3/cmake/Modules/VolkPython.cmake 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/Modules/VolkPython.cmake 2018-03-26 22:52:55.000000000 +0000 @@ -36,11 +36,12 @@ else(PYTHON_EXECUTABLE) #use the built-in find script + set(Python_ADDITIONAL_VERSIONS 3.4 3.5 3.6) find_package(PythonInterp 2) #and if that fails use the find program routine if(NOT PYTHONINTERP_FOUND) - find_program(PYTHON_EXECUTABLE NAMES python python2 python2.7 python2.6 python2.5) + find_program(PYTHON_EXECUTABLE NAMES python python2 python2.7 python3) if(PYTHON_EXECUTABLE) set(PYTHONINTERP_FOUND TRUE) endif(PYTHON_EXECUTABLE) @@ -100,7 +101,7 @@ if(NOT DEFINED VOLK_PYTHON_DIR) execute_process(COMMAND 
${PYTHON_EXECUTABLE} -c " from distutils import sysconfig -print sysconfig.get_python_lib(plat_specific=True, prefix='') +print(sysconfig.get_python_lib(plat_specific=True, prefix='')) " OUTPUT_VARIABLE VOLK_PYTHON_DIR OUTPUT_STRIP_TRAILING_WHITESPACE ) endif() @@ -113,7 +114,7 @@ function(VOLK_UNIQUE_TARGET desc) file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib -unique = hashlib.md5('${reldir}${ARGN}').hexdigest()[:5] +unique = hashlib.md5(b'${reldir}${ARGN}').hexdigest()[:5] print(re.sub('\\W', '_', '${desc} ${reldir} ' + unique))" OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE) add_custom_target(${_target} ALL DEPENDS ${ARGN}) @@ -230,7 +231,7 @@ file(WRITE ${CMAKE_BINARY_DIR}/python_compile_helper.py " import sys, py_compile files = sys.argv[1:] -srcs, gens = files[:len(files)/2], files[len(files)/2:] +srcs, gens = files[:len(files)//2], files[len(files)//2:] for src, gen in zip(srcs, gens): py_compile.compile(file=src, cfile=gen, doraise=True) ") diff -Nru volk-1.3/cmake/msvc/config.h volk-1.4/cmake/msvc/config.h --- volk-1.3/cmake/msvc/config.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/msvc/config.h 2018-03-26 22:52:55.000000000 +0000 @@ -21,6 +21,7 @@ //////////////////////////////////////////////////////////////////////// // rint functions //////////////////////////////////////////////////////////////////////// +#if _MSC_VER < 1800 #include static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);} static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);} @@ -28,25 +29,15 @@ static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);} static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);} static inline float rintf(float x){return (x > 0.0f)? 
floorf(x + 0.5f) : ceilf(x - 0.5f);} +#endif //////////////////////////////////////////////////////////////////////// // math constants //////////////////////////////////////////////////////////////////////// +#if _MSC_VER < 1800 +#include #define INFINITY HUGE_VAL - -# define M_E 2.7182818284590452354 /* e */ -# define M_LOG2E 1.4426950408889634074 /* log_2 e */ -# define M_LOG10E 0.43429448190325182765 /* log_10 e */ -# define M_LN2 0.69314718055994530942 /* log_e 2 */ -# define M_LN10 2.30258509299404568402 /* log_e 10 */ -# define M_PI 3.14159265358979323846 /* pi */ -# define M_PI_2 1.57079632679489661923 /* pi/2 */ -# define M_PI_4 0.78539816339744830962 /* pi/4 */ -# define M_1_PI 0.31830988618379067154 /* 1/pi */ -# define M_2_PI 0.63661977236758134308 /* 2/pi */ -# define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */ -# define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ -# define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ +#endif //////////////////////////////////////////////////////////////////////// // random and srandom diff -Nru volk-1.3/cmake/msvc/inttypes.h volk-1.4/cmake/msvc/inttypes.h --- volk-1.3/cmake/msvc/inttypes.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/msvc/inttypes.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,301 +0,0 @@ -// ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" 
-#endif // _MSC_VER ] - -#ifndef _MSC_INTTYPES_H_ // [ -#define _MSC_INTTYPES_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include - -// 7.8 Format conversion of integer types - -typedef struct { - intmax_t quot; - intmax_t rem; -} imaxdiv_t; - -// 7.8.1 Macros for format specifiers - -// The fprintf macros for signed integers are: -#define PRId8 "d" -#define PRIi8 "i" -#define PRIdLEAST8 "d" -#define PRIiLEAST8 "i" -#define PRIdFAST8 "d" -#define PRIiFAST8 "i" - -#define PRId16 "hd" -#define PRIi16 "hi" -#define PRIdLEAST16 "hd" -#define PRIiLEAST16 "hi" -#define PRIdFAST16 "hd" -#define PRIiFAST16 "hi" - -#define PRId32 "I32d" -#define PRIi32 "I32i" -#define PRIdLEAST32 "I32d" -#define PRIiLEAST32 "I32i" -#define PRIdFAST32 "I32d" -#define PRIiFAST32 "I32i" - -#define PRId64 "I64d" -#define PRIi64 "I64i" -#define PRIdLEAST64 "I64d" -#define PRIiLEAST64 "I64i" -#define PRIdFAST64 "I64d" -#define PRIiFAST64 "I64i" - -#define PRIdMAX "I64d" -#define PRIiMAX "I64i" - -#define PRIdPTR "Id" -#define PRIiPTR "Ii" - -// The fprintf macros for unsigned integers are: -#define PRIo8 "o" -#define PRIu8 "u" -#define PRIx8 "x" -#define PRIX8 "X" -#define PRIoLEAST8 "o" -#define PRIuLEAST8 "u" -#define PRIxLEAST8 "x" -#define PRIXLEAST8 "X" -#define PRIoFAST8 "o" -#define PRIuFAST8 "u" -#define PRIxFAST8 "x" -#define PRIXFAST8 "X" - -#define PRIo16 "ho" -#define PRIu16 "hu" -#define PRIx16 "hx" -#define PRIX16 "hX" -#define PRIoLEAST16 "ho" -#define PRIuLEAST16 "hu" -#define PRIxLEAST16 "hx" -#define PRIXLEAST16 "hX" -#define PRIoFAST16 "ho" -#define PRIuFAST16 "hu" -#define PRIxFAST16 "hx" -#define PRIXFAST16 "hX" - -#define PRIo32 "I32o" -#define PRIu32 "I32u" -#define PRIx32 "I32x" -#define PRIX32 "I32X" -#define PRIoLEAST32 "I32o" -#define PRIuLEAST32 "I32u" -#define PRIxLEAST32 "I32x" -#define PRIXLEAST32 "I32X" -#define PRIoFAST32 "I32o" -#define PRIuFAST32 "I32u" -#define PRIxFAST32 "I32x" -#define PRIXFAST32 "I32X" - -#define PRIo64 "I64o" -#define PRIu64 "I64u" 
-#define PRIx64 "I64x" -#define PRIX64 "I64X" -#define PRIoLEAST64 "I64o" -#define PRIuLEAST64 "I64u" -#define PRIxLEAST64 "I64x" -#define PRIXLEAST64 "I64X" -#define PRIoFAST64 "I64o" -#define PRIuFAST64 "I64u" -#define PRIxFAST64 "I64x" -#define PRIXFAST64 "I64X" - -#define PRIoMAX "I64o" -#define PRIuMAX "I64u" -#define PRIxMAX "I64x" -#define PRIXMAX "I64X" - -#define PRIoPTR "Io" -#define PRIuPTR "Iu" -#define PRIxPTR "Ix" -#define PRIXPTR "IX" - -// The fscanf macros for signed integers are: -#define SCNd8 "d" -#define SCNi8 "i" -#define SCNdLEAST8 "d" -#define SCNiLEAST8 "i" -#define SCNdFAST8 "d" -#define SCNiFAST8 "i" - -#define SCNd16 "hd" -#define SCNi16 "hi" -#define SCNdLEAST16 "hd" -#define SCNiLEAST16 "hi" -#define SCNdFAST16 "hd" -#define SCNiFAST16 "hi" - -#define SCNd32 "ld" -#define SCNi32 "li" -#define SCNdLEAST32 "ld" -#define SCNiLEAST32 "li" -#define SCNdFAST32 "ld" -#define SCNiFAST32 "li" - -#define SCNd64 "I64d" -#define SCNi64 "I64i" -#define SCNdLEAST64 "I64d" -#define SCNiLEAST64 "I64i" -#define SCNdFAST64 "I64d" -#define SCNiFAST64 "I64i" - -#define SCNdMAX "I64d" -#define SCNiMAX "I64i" - -#ifdef _WIN64 // [ -# define SCNdPTR "I64d" -# define SCNiPTR "I64i" -#else // _WIN64 ][ -# define SCNdPTR "ld" -# define SCNiPTR "li" -#endif // _WIN64 ] - -// The fscanf macros for unsigned integers are: -#define SCNo8 "o" -#define SCNu8 "u" -#define SCNx8 "x" -#define SCNX8 "X" -#define SCNoLEAST8 "o" -#define SCNuLEAST8 "u" -#define SCNxLEAST8 "x" -#define SCNXLEAST8 "X" -#define SCNoFAST8 "o" -#define SCNuFAST8 "u" -#define SCNxFAST8 "x" -#define SCNXFAST8 "X" - -#define SCNo16 "ho" -#define SCNu16 "hu" -#define SCNx16 "hx" -#define SCNX16 "hX" -#define SCNoLEAST16 "ho" -#define SCNuLEAST16 "hu" -#define SCNxLEAST16 "hx" -#define SCNXLEAST16 "hX" -#define SCNoFAST16 "ho" -#define SCNuFAST16 "hu" -#define SCNxFAST16 "hx" -#define SCNXFAST16 "hX" - -#define SCNo32 "lo" -#define SCNu32 "lu" -#define SCNx32 "lx" -#define SCNX32 "lX" -#define 
SCNoLEAST32 "lo" -#define SCNuLEAST32 "lu" -#define SCNxLEAST32 "lx" -#define SCNXLEAST32 "lX" -#define SCNoFAST32 "lo" -#define SCNuFAST32 "lu" -#define SCNxFAST32 "lx" -#define SCNXFAST32 "lX" - -#define SCNo64 "I64o" -#define SCNu64 "I64u" -#define SCNx64 "I64x" -#define SCNX64 "I64X" -#define SCNoLEAST64 "I64o" -#define SCNuLEAST64 "I64u" -#define SCNxLEAST64 "I64x" -#define SCNXLEAST64 "I64X" -#define SCNoFAST64 "I64o" -#define SCNuFAST64 "I64u" -#define SCNxFAST64 "I64x" -#define SCNXFAST64 "I64X" - -#define SCNoMAX "I64o" -#define SCNuMAX "I64u" -#define SCNxMAX "I64x" -#define SCNXMAX "I64X" - -#ifdef _WIN64 // [ -# define SCNoPTR "I64o" -# define SCNuPTR "I64u" -# define SCNxPTR "I64x" -# define SCNXPTR "I64X" -#else // _WIN64 ][ -# define SCNoPTR "lo" -# define SCNuPTR "lu" -# define SCNxPTR "lx" -# define SCNXPTR "lX" -#endif // _WIN64 ] - -// 7.8.2 Functions for greatest-width integer types - -// 7.8.2.1 The imaxabs function -#define imaxabs _abs64 - -// 7.8.2.2 The imaxdiv function - -// This is modified version of div() function from Microsoft's div.c found -// in %MSVC.NET%\crt\src\div.c -#ifdef STATIC_IMAXDIV // [ -static -#else // STATIC_IMAXDIV ][ -_inline -#endif // STATIC_IMAXDIV ] -imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) -{ - imaxdiv_t result; - - result.quot = numer / denom; - result.rem = numer % denom; - - if (numer < 0 && result.rem > 0) { - // did division wrong; must fix up - ++result.quot; - result.rem -= denom; - } - - return result; -} - -// 7.8.2.3 The strtoimax and strtoumax functions -#define strtoimax _strtoi64 -#define strtoumax _strtoui64 - -// 7.8.2.4 The wcstoimax and wcstoumax functions -#define wcstoimax _wcstoi64 -#define wcstoumax _wcstoui64 - - -#endif // _MSC_INTTYPES_H_ ] diff -Nru volk-1.3/cmake/msvc/stdbool.h volk-1.4/cmake/msvc/stdbool.h --- volk-1.3/cmake/msvc/stdbool.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/msvc/stdbool.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,45 +0,0 @@ -/* - * 
Copyright (C) 2005, 2006 Apple Computer, Inc. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public License - * along with this library; see the file COPYING.LIB. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - * - */ - -#ifndef STDBOOL_WIN32_H -#define STDBOOL_WIN32_H - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef __cplusplus - -typedef unsigned char bool; - -#define true 1 -#define false 0 - -#ifndef CASSERT -#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1]; -#endif - -CASSERT(sizeof(bool) == 1, bool_is_one_byte) -CASSERT(true, true_is_true) -CASSERT(!false, false_is_false) - -#endif - -#endif diff -Nru volk-1.3/cmake/msvc/stdint.h volk-1.4/cmake/msvc/stdint.h --- volk-1.3/cmake/msvc/stdint.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/cmake/msvc/stdint.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,251 +0,0 @@ -// ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2008 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. 
Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_STDINT_H_ // [ -#define _MSC_STDINT_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. 
-#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. char has the same size as __int8 -// so we give up on __intX for them. -#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See 
footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX -#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX -#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN 
-# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -#ifndef INTMAX_C -#define INTMAX_C INT64_C -#endif -#ifndef UINTMAX_C -#define UINTMAX_C UINT64_C -#endif - -#endif // __STDC_CONSTANT_MACROS ] - - -#endif // _MSC_STDINT_H_ ] diff -Nru volk-1.3/cmake/Toolchains/arm-linux-gnueabihf.cmake volk-1.4/cmake/Toolchains/arm-linux-gnueabihf.cmake --- volk-1.3/cmake/Toolchains/arm-linux-gnueabihf.cmake 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/cmake/Toolchains/arm-linux-gnueabihf.cmake 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,33 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR ARM) + +if(MINGW OR CYGWIN OR WIN32) + set(UTIL_SEARCH_CMD where) +elseif(UNIX OR APPLE) + set(UTIL_SEARCH_CMD which) +endif() + +set(TOOLCHAIN_PREFIX arm-linux-gnueabihf-) + +execute_process( + COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc + OUTPUT_VARIABLE BINUTILS_PATH + 
OUTPUT_STRIP_TRAILING_WHITESPACE + ) + +get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY) + +# The following is not needed on debian +# Without that flag CMake is not able to pass test compilation check +#set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs") + +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc) +set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++) + +set(CMAKE_OBJCOPY ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy CACHE INTERNAL "objcopy tool") +set(CMAKE_SIZE_UTIL ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size CACHE INTERNAL "size tool") + +set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) + +set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-arm -L /usr/arm-linux-gnueabihf/") diff -Nru volk-1.3/CMakeLists.txt volk-1.4/CMakeLists.txt --- volk-1.3/CMakeLists.txt 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/CMakeLists.txt 2018-03-26 22:52:55.000000000 +0000 @@ -44,11 +44,10 @@ message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") set(VERSION_INFO_MAJOR_VERSION 1) -set(VERSION_INFO_MINOR_VERSION 3) +set(VERSION_INFO_MINOR_VERSION 4) set(VERSION_INFO_MAINT_VERSION 0) include(VolkVersion) #setup version info - ######################################################################## # Environment setup ######################################################################## @@ -61,6 +60,14 @@ ENDIF() SET(CROSSCOMPILE_MULTILIB ${CROSSCOMPILE_MULTILIB} CACHE STRING "Define \"true\" if you have and want to use multiple C development libs installed for cross compile") +if(MSVC) + add_definitions(-D_USE_MATH_DEFINES) #enables math constants on all supported versions of MSVC + add_compile_options(/W1) #reduce warnings + add_compile_options(/wo4309) + add_compile_options(/wd4752) + add_compile_options(/wo4273) + add_compile_options(/wo4838) +endif(MSVC) ######################################################################## # Dependencies setup @@ -68,16 +75,22 @@ # Python include(VolkPython) #sets PYTHON_EXECUTABLE 
and PYTHON_DASH_B -VOLK_PYTHON_CHECK_MODULE("python >= 2.5" sys "sys.version.split()[0] >= '2.5'" PYTHON_MIN_VER_FOUND) -VOLK_PYTHON_CHECK_MODULE("Cheetah >= 2.0.0" Cheetah "Cheetah.Version >= '2.0.0'" CHEETAH_FOUND) +VOLK_PYTHON_CHECK_MODULE("python >= 2.7" sys "sys.version.split()[0] >= '2.7'" PYTHON_MIN_VER_FOUND) +VOLK_PYTHON_CHECK_MODULE("mako >= 0.4.2" mako "mako.__version__ >= '0.4.2'" MAKO_FOUND) +VOLK_PYTHON_CHECK_MODULE("six - python 2 and 3 compatibility library" six "True" SIX_FOUND) if(NOT PYTHON_MIN_VER_FOUND) - message(FATAL_ERROR "Python 2.5 or greater required to build VOLK") + message(FATAL_ERROR "Python 2.7 or greater required to build VOLK") +endif() + +# Mako +if(NOT MAKO_FOUND) + message(FATAL_ERROR "Mako templates required to build VOLK") endif() -# Cheetah -if(NOT CHEETAH_FOUND) - message(FATAL_ERROR "Cheetah templates required to build VOLK") +# Six +if(NOT SIX_FOUND) + message(FATAL_ERROR "six - python 2 and 3 compatibility library required to build VOLK") endif() # Boost @@ -117,7 +130,7 @@ ${CMAKE_BINARY_DIR}/Doxyfile @ONLY) - add_custom_target(doc + add_custom_target(volk_doc ${DOXYGEN_EXECUTABLE} ${CMAKE_BINARY_DIR}/Doxyfile WORKING_DIRECTORY ${CMAKE_BINARY_DIR} COMMENT "Generating documentation with Doxygen" VERBATIM @@ -161,7 +174,9 @@ ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h + ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h ${CMAKE_BINARY_DIR}/include/volk/volk.h diff -Nru volk-1.3/debian/changelog volk-1.4/debian/changelog --- volk-1.3/debian/changelog 2018-02-04 18:12:21.000000000 +0000 +++ volk-1.4/debian/changelog 2019-08-25 11:32:49.000000000 +0000 @@ -1,3 +1,39 @@ +volk (1.4-3~ubuntu18.04.1~ppa1) 
bionic; urgency=medium + + * No-change backport to bionic + + -- Alexandru Csete Sun, 25 Aug 2019 13:32:49 +0200 + +volk (1.4-3) unstable; urgency=medium + + * update to v1.4-9-g297fefd + Added an AVX protokernel for volk_32fc_x2_32f_square_dist_scalar_mult_32f + fixed a buffer over-read and over-write in + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx + Fix 32u_reverse_32u for ARM + + -- A. Maitland Bottoms Sat, 12 May 2018 15:25:04 -0400 + +volk (1.4-2) unstable; urgency=medium + + * Upload to unstable, needed by gnuradio (>= 3.7.12.0) + + -- A. Maitland Bottoms Tue, 03 Apr 2018 01:03:19 -0400 + +volk (1.4-1) experimental; urgency=medium + + * New upstream release + upstream changelog http://libvolk.org/release-v14.html + + -- A. Maitland Bottoms Tue, 27 Mar 2018 22:57:42 -0400 + +volk (1.3.1-1) unstable; urgency=medium + + * New upstream bugfix release + * Refresh all debian patches for use with git am + + -- A. Maitland Bottoms Tue, 27 Mar 2018 21:54:29 -0400 + volk (1.3-3) unstable; urgency=medium * update to v1.3-23-g0109b2e diff -Nru volk-1.3/debian/control volk-1.4/debian/control --- volk-1.3/debian/control 2018-02-04 18:12:21.000000000 +0000 +++ volk-1.4/debian/control 2018-05-12 19:25:04.000000000 +0000 @@ -1,6 +1,6 @@ Source: volk Section: libdevel -Priority: extra +Priority: optional Maintainer: A. 
Maitland Bottoms Build-Depends: cmake, debhelper (>= 9.0.0~), @@ -13,13 +13,17 @@ liborc-0.4-dev, pkg-config, python, - python-cheetah -Standards-Version: 4.1.3 + python3, + python-mako, + python3-mako, + python-six, + python3-six +Standards-Version: 4.1.4 Homepage: http://libvolk.org Vcs-Browser: https://salsa.debian.org/bottoms/pkg-volk Vcs-Git: https://salsa.debian.org/bottoms/pkg-volk.git -Package: libvolk1.3 +Package: libvolk1.4 Section: libs Architecture: any Pre-Depends: ${misc:Pre-Depends} @@ -36,7 +40,7 @@ Package: libvolk1-dev Architecture: any Pre-Depends: ${misc:Pre-Depends} -Depends: libvolk1.3 (=${binary:Version}), ${misc:Depends} +Depends: libvolk1.4 (=${binary:Version}), ${misc:Depends} Breaks: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev Replaces: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev Multi-Arch: same @@ -53,7 +57,7 @@ Section: libs Architecture: any Pre-Depends: ${misc:Pre-Depends} -Depends: libvolk1.3 (=${binary:Version}), +Depends: libvolk1.4 (=${binary:Version}), ${misc:Depends}, ${python:Depends}, ${shlibs:Depends} diff -Nru volk-1.3/debian/libvolk1.3.install volk-1.4/debian/libvolk1.3.install --- volk-1.3/debian/libvolk1.3.install 2016-07-02 20:52:58.000000000 +0000 +++ volk-1.4/debian/libvolk1.3.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/*/libvolk.so.1.3 diff -Nru volk-1.3/debian/libvolk1.4.install volk-1.4/debian/libvolk1.4.install --- volk-1.3/debian/libvolk1.4.install 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/libvolk1.4.install 2018-03-28 02:59:46.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/*/libvolk.so.* diff -Nru volk-1.3/debian/libvolk1-dev_1.4-1_report.html volk-1.4/debian/libvolk1-dev_1.4-1_report.html --- volk-1.3/debian/libvolk1-dev_1.4-1_report.html 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/libvolk1-dev_1.4-1_report.html 2018-03-28 03:42:28.000000000 +0000 @@ -0,0 +1,1069 @@ + + + + + + + + +libvolk1-dev: 1.3-3 to 1.4-1 compatibility report + + + +

API compatibility report for the libvolk1-dev library between 1.3-3 and 1.4-1 versions on x86_64

+ +
+

+ Binary
Compatibility + Source
Compatibility +

Test Info

+ + + + + + + +

Library Name	libvolk1-dev
Version #1	1.3-3
Version #2	1.4-1
Arch	x86_64
GCC Version	7
Subject	Binary Compatibility

Test Results

+ + + + + + +

Total Header Files	135
Total Libraries	1
Total Symbols / Types	614 / 233
Compatibility	99.8%

Problem Summary

+ + + + + + + + + +

	Severity	Count
Added Symbols	-	45
Removed Symbols	High	0
Problems with Data Types	High	0
	Medium	0
	Low	1
Problems with Symbols	High	1
	Medium	0
	Low	0
Problems with Constants	Low	1

+ +

Added Symbols 45

+volk.h, libvolk.so.1.4
+volk_32f_64f_add_64f [data]
+volk_32f_64f_add_64f_a [data]
+volk_32f_64f_add_64f_get_func_desc ( )
+volk_32f_64f_add_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_32f_64f_add_64f_u [data]
+volk_32f_64f_multiply_64f [data]
+volk_32f_64f_multiply_64f_a [data]
+volk_32f_64f_multiply_64f_get_func_desc ( )
+volk_32f_64f_multiply_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_32f_64f_multiply_64f_u [data]
+volk_32f_s32f_mod_rangepuppet_32f [data]
+volk_32f_s32f_mod_rangepuppet_32f_a [data]
+volk_32f_s32f_mod_rangepuppet_32f_get_func_desc ( )
+volk_32f_s32f_mod_rangepuppet_32f_manual ( float* output, float const* input, float bound, unsigned int num_points, char const* impl_name )
+volk_32f_s32f_mod_rangepuppet_32f_u [data]
+volk_32f_s32f_s32f_mod_range_32f [data]
+volk_32f_s32f_s32f_mod_range_32f_a [data]
+volk_32f_s32f_s32f_mod_range_32f_get_func_desc ( )
+volk_32f_s32f_s32f_mod_range_32f_manual ( float* outputVector, float const* inputVector, float const lower_bound, float const upper_bound, unsigned int num_points, char const* impl_name )
+volk_32f_s32f_s32f_mod_range_32f_u [data]
+volk_32fc_32f_add_32fc [data]
+volk_32fc_32f_add_32fc_a [data]
+volk_32fc_32f_add_32fc_get_func_desc ( )
+volk_32fc_32f_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, float const* bVector, unsigned int num_points, char const* impl_name )
+volk_32fc_32f_add_32fc_u [data]
+volk_32fc_x2_add_32fc [data]
+volk_32fc_x2_add_32fc_a [data]
+volk_32fc_x2_add_32fc_get_func_desc ( )
+volk_32fc_x2_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, lv_32fc_t const* bVector, unsigned int num_points, char const* impl_name )
+volk_32fc_x2_add_32fc_u [data]
+volk_32u_reverse_32u [data]
+volk_32u_reverse_32u_a [data]
+volk_32u_reverse_32u_get_func_desc ( )
+volk_32u_reverse_32u_manual ( uint32_t* out, uint32_t const* in, unsigned int num_points, char const* impl_name )
+volk_32u_reverse_32u_u [data]
+volk_64f_x2_add_64f [data]
+volk_64f_x2_add_64f_a [data]
+volk_64f_x2_add_64f_get_func_desc ( )
+volk_64f_x2_add_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_64f_x2_add_64f_u [data]
+volk_64f_x2_multiply_64f [data]
+volk_64f_x2_multiply_64f_a [data]
+volk_64f_x2_multiply_64f_get_func_desc ( )
+volk_64f_x2_multiply_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_64f_x2_multiply_64f_u [data]
+
+to the top
+ +

Problems with Symbols, High Severity 1

	Change	Effect
1	3rd middle parameter frame_size has been removed from the calling stack.	Layout of parameter's stack has been changed and therefore parameters at higher positions in the stack may be incorrectly initialized by applications.

+volk.h, libvolk.so.1.3
+ +[+] volk_32f_8u_polarbutterfly_32f_manual ( float* llrs, unsigned char* u, int const frame_size, int const frame_exp, int const stage, int const u_num, int const row, char const* impl_name ) 1 +
+ +
+to the top
+ +

Problems with Data Types, Low Severity 1

	Change	Effect
1	Base type has been changed from void()(float, unsigned char, int, int, int, int, int) to void()(float, unsigned char, int, int, int, int).	Replacement of the base data type may indicate a change in its semantic meaning.

+volk_typedefs.h
+ +[+] typedef p_32f_8u_polarbutterfly_32f 1 +
+ + +
+to the top
+ +

Problems with Constants, Low Severity 1

	Change	Effect
1	The constant LOG_POLY_DEGREE with value 6 has been removed.	The value of this constant may no longer be properly handled by new-version library functions.

+volk_32f_log2_32f.h
+ +[+] LOG_POLY_DEGREE +
+ + +
+to the top
+

Header Files 135

+constants.h
+saturation_arithmetic.h
+volk.h
+volk_16i_32fc_dot_prod_32fc.h
+volk_16i_branch_4_state_8.h
+volk_16i_convert_8i.h
+volk_16i_max_star_16i.h
+volk_16i_max_star_horizontal_16i.h
+volk_16i_permute_and_scalar_add.h
+volk_16i_s32f_convert_32f.h
+volk_16i_x4_quad_max_star_16i.h
+volk_16i_x5_add_quad_16i_x4.h
+volk_16ic_convert_32fc.h
+volk_16ic_deinterleave_16i_x2.h
+volk_16ic_deinterleave_real_16i.h
+volk_16ic_deinterleave_real_8i.h
+volk_16ic_magnitude_16i.h
+volk_16ic_s32f_deinterleave_32f_x2.h
+volk_16ic_s32f_deinterleave_real_32f.h
+volk_16ic_s32f_magnitude_32f.h
+volk_16ic_x2_dot_prod_16ic.h
+volk_16ic_x2_multiply_16ic.h
+volk_16u_byteswap.h
+volk_16u_byteswappuppet_16u.h
+volk_32f_8u_polarbutterfly_32f.h
+volk_32f_8u_polarbutterflypuppet_32f.h
+volk_32f_accumulator_s32f.h
+volk_32f_acos_32f.h
+volk_32f_asin_32f.h
+volk_32f_atan_32f.h
+volk_32f_binary_slicer_32i.h
+volk_32f_binary_slicer_8i.h
+volk_32f_convert_64f.h
+volk_32f_cos_32f.h
+volk_32f_expfast_32f.h
+volk_32f_index_max_16u.h
+volk_32f_index_max_32u.h
+volk_32f_invsqrt_32f.h
+volk_32f_log2_32f.h
+volk_32f_null_32f.h
+volk_32f_s32f_32f_fm_detect_32f.h
+volk_32f_s32f_calc_spectral_noise_floor_32f.h
+volk_32f_s32f_convert_16i.h
+volk_32f_s32f_convert_32i.h
+volk_32f_s32f_convert_8i.h
+volk_32f_s32f_multiply_32f.h
+volk_32f_s32f_normalize.h
+volk_32f_s32f_power_32f.h
+volk_32f_s32f_stddev_32f.h
+volk_32f_sin_32f.h
+volk_32f_sqrt_32f.h
+volk_32f_stddev_and_mean_32f_x2.h
+volk_32f_tan_32f.h
+volk_32f_tanh_32f.h
+volk_32f_x2_add_32f.h
+volk_32f_x2_divide_32f.h
+volk_32f_x2_dot_prod_16i.h
+volk_32f_x2_dot_prod_32f.h
+volk_32f_x2_fm_detectpuppet_32f.h
+volk_32f_x2_interleave_32fc.h
+volk_32f_x2_max_32f.h
+volk_32f_x2_min_32f.h
+volk_32f_x2_multiply_32f.h
+volk_32f_x2_pow_32f.h
+volk_32f_x2_s32f_interleave_16ic.h
+volk_32f_x2_subtract_32f.h
+volk_32f_x3_sum_of_poly_32f.h
+volk_32fc_32f_dot_prod_32fc.h
+volk_32fc_32f_multiply_32fc.h
+volk_32fc_conjugate_32fc.h
+volk_32fc_convert_16ic.h
+volk_32fc_deinterleave_32f_x2.h
+volk_32fc_deinterleave_64f_x2.h
+volk_32fc_deinterleave_imag_32f.h
+volk_32fc_deinterleave_real_32f.h
+volk_32fc_deinterleave_real_64f.h
+volk_32fc_index_max_16u.h
+volk_32fc_index_max_32u.h
+volk_32fc_magnitude_32f.h
+volk_32fc_magnitude_squared_32f.h
+volk_32fc_s32f_atan2_32f.h
+volk_32fc_s32f_deinterleave_real_16i.h
+volk_32fc_s32f_magnitude_16i.h
+volk_32fc_s32f_power_32fc.h
+volk_32fc_s32f_power_spectrum_32f.h
+volk_32fc_s32f_x2_power_spectral_density_32f.h
+volk_32fc_s32fc_multiply_32fc.h
+volk_32fc_s32fc_rotatorpuppet_32fc.h
+volk_32fc_s32fc_x2_rotator_32fc.h
+volk_32fc_x2_conjugate_dot_prod_32fc.h
+volk_32fc_x2_divide_32fc.h
+volk_32fc_x2_dot_prod_32fc.h
+volk_32fc_x2_multiply_32fc.h
+volk_32fc_x2_multiply_conjugate_32fc.h
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
+volk_32fc_x2_square_dist_32f.h
+volk_32i_s32f_convert_32f.h
+volk_32i_x2_and_32i.h
+volk_32i_x2_or_32i.h
+volk_32u_byteswap.h
+volk_32u_byteswappuppet_32u.h
+volk_32u_popcnt.h
+volk_32u_popcntpuppet_32u.h
+volk_64f_convert_32f.h
+volk_64f_x2_max_64f.h
+volk_64f_x2_min_64f.h
+volk_64u_byteswap.h
+volk_64u_byteswappuppet_64u.h
+volk_64u_popcnt.h
+volk_64u_popcntpuppet_64u.h
+volk_8i_convert_16i.h
+volk_8i_s32f_convert_32f.h
+volk_8ic_deinterleave_16i_x2.h
+volk_8ic_deinterleave_real_16i.h
+volk_8ic_deinterleave_real_8i.h
+volk_8ic_s32f_deinterleave_32f_x2.h
+volk_8ic_s32f_deinterleave_real_32f.h
+volk_8ic_x2_multiply_conjugate_16ic.h
+volk_8ic_x2_s32f_multiply_conjugate_32fc.h
+volk_8u_conv_k7_r2puppet_8u.h
+volk_8u_x2_encodeframepolar_8u.h
+volk_8u_x3_encodepolar_8u_x2.h
+volk_8u_x3_encodepolarpuppet_8u.h
+volk_8u_x4_conv_k7_r2_8u.h
+volk_avx_intrinsics.h
+volk_common.h
+volk_complex.h
+volk_config_fixed.h
+volk_cpu.h
+volk_malloc.h
+volk_neon_intrinsics.h
+volk_prefs.h
+volk_sse3_intrinsics.h
+volk_sse_intrinsics.h
+volk_typedefs.h
+

+
to the top
+

Libraries 1

+libvolk.so.1.3
+

+
to the top
+

Test Info

+ + + + + + +

Library Name	libvolk1-dev
Version #1	1.3-3
Version #2	1.4-1
Arch	x86_64
Subject	Source Compatibility

Test Results

+ + + + + + +

Total Header Files	135
Total Libraries	1
Total Symbols / Types	660 / 235
Compatibility	99.1%

Problem Summary

+ + + + + + + + + + +

	Severity	Count
Added Symbols	-	46
Removed Symbols	High	5
Problems with Data Types	High	0
	Medium	0
	Low	1
Problems with Symbols	High	1
	Medium	0
	Low	0
Problems with Constants	Low	1
Other Changes in Constants	-	2

+ +

Added Symbols 46

+volk.h
+volk_32f_64f_add_64f [data]
+volk_32f_64f_add_64f_a [data]
+volk_32f_64f_add_64f_get_func_desc ( )
+volk_32f_64f_add_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_32f_64f_add_64f_u [data]
+volk_32f_64f_multiply_64f [data]
+volk_32f_64f_multiply_64f_a [data]
+volk_32f_64f_multiply_64f_get_func_desc ( )
+volk_32f_64f_multiply_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_32f_64f_multiply_64f_u [data]
+volk_32f_s32f_mod_rangepuppet_32f [data]
+volk_32f_s32f_mod_rangepuppet_32f_a [data]
+volk_32f_s32f_mod_rangepuppet_32f_get_func_desc ( )
+volk_32f_s32f_mod_rangepuppet_32f_manual ( float* output, float const* input, float bound, unsigned int num_points, char const* impl_name )
+volk_32f_s32f_mod_rangepuppet_32f_u [data]
+volk_32f_s32f_s32f_mod_range_32f [data]
+volk_32f_s32f_s32f_mod_range_32f_a [data]
+volk_32f_s32f_s32f_mod_range_32f_get_func_desc ( )
+volk_32f_s32f_s32f_mod_range_32f_manual ( float* outputVector, float const* inputVector, float const lower_bound, float const upper_bound, unsigned int num_points, char const* impl_name )
+volk_32f_s32f_s32f_mod_range_32f_u [data]
+volk_32fc_32f_add_32fc [data]
+volk_32fc_32f_add_32fc_a [data]
+volk_32fc_32f_add_32fc_get_func_desc ( )
+volk_32fc_32f_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, float const* bVector, unsigned int num_points, char const* impl_name )
+volk_32fc_32f_add_32fc_u [data]
+volk_32fc_x2_add_32fc [data]
+volk_32fc_x2_add_32fc_a [data]
+volk_32fc_x2_add_32fc_get_func_desc ( )
+volk_32fc_x2_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, lv_32fc_t const* bVector, unsigned int num_points, char const* impl_name )
+volk_32fc_x2_add_32fc_u [data]
+volk_32u_reverse_32u [data]
+volk_32u_reverse_32u_a [data]
+volk_32u_reverse_32u_get_func_desc ( )
+volk_32u_reverse_32u_manual ( uint32_t* out, uint32_t const* in, unsigned int num_points, char const* impl_name )
+volk_32u_reverse_32u_u [data]
+volk_64f_x2_add_64f [data]
+volk_64f_x2_add_64f_a [data]
+volk_64f_x2_add_64f_get_func_desc ( )
+volk_64f_x2_add_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_64f_x2_add_64f_u [data]
+volk_64f_x2_multiply_64f [data]
+volk_64f_x2_multiply_64f_a [data]
+volk_64f_x2_multiply_64f_get_func_desc ( )
+volk_64f_x2_multiply_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
+volk_64f_x2_multiply_64f_u [data]
+
+volk_32u_reverse_32u.h
+ +BitReverseTable256 [data] +
+ + +
+to the top
+

Removed Symbols 5

+constants.h
+volk_available_machines ( )
+volk_c_compiler ( )
+volk_compiler_flags ( )
+volk_prefix ( )
+volk_version ( )
+
+to the top
+ +

Problems with Symbols, High Severity 1

	Change	Effect
1	3rd middle parameter frame_size has been removed from the calling stack.	Recompilation of a client program may be broken.

+volk.h
+ +[+] volk_32f_8u_polarbutterfly_32f_manual ( float* llrs, unsigned char* u, int const frame_size, int const frame_exp, int const stage, int const u_num, int const row, char const* impl_name ) 1 +
+ +
+to the top
+ +

Problems with Data Types, Low Severity 1

	Change	Effect
1	Base type has been changed from void()(float, unsigned char, int, int, int, int, int) to void()(float, unsigned char, int, int, int, int).	Recompilation of a client program may be broken.

+volk_typedefs.h
+ +[+] typedef p_32f_8u_polarbutterfly_32f 1 +
+ + +
+to the top
+ +

Problems with Constants, Low Severity 1

	Change	Effect
1	The constant LOG_POLY_DEGREE with value 6 has been removed.	Recompilation of a client program may be broken.

+volk_32f_log2_32f.h
+ +[+] LOG_POLY_DEGREE +
+ + +
+to the top
+ +

Other Changes in Constants 2

	Change	Effect
1	The constant __VOLK_ASM with value __asm__ has been added.	No effect.

	Change	Effect
1	The constant __VOLK_VOLATILE with value __volatile__ has been added.	No effect.

+volk_common.h
+ +[+] __VOLK_ASM +
+ + + +[+] __VOLK_VOLATILE +
+ + +
+to the top
+

Header Files 135

+
to the top
+

Libraries 1

+libvolk.so.1.3
+

+
to the top
+

+ +
+ + Binary files /tmp/tmpPx0NnN/uw5Szp_LXu/volk-1.3/debian/libvolk1-dev.abi.tar.gz.amd64 and /tmp/tmpPx0NnN/L73hFeo27g/volk-1.4/debian/libvolk1-dev.abi.tar.gz.amd64 differ diff -Nru volk-1.3/debian/libvolk1-dev.acc volk-1.4/debian/libvolk1-dev.acc --- volk-1.3/debian/libvolk1-dev.acc 2016-07-02 20:53:52.000000000 +0000 +++ volk-1.4/debian/libvolk1-dev.acc 2018-04-03 18:00:06.000000000 +0000 @@ -6,7 +6,7 @@ -debian/libvolk1.3/usr/lib/ +debian/libvolk1.4/usr/lib/ diff -Nru volk-1.3/debian/patches/0001-Add-a-AppVeyor-compatible-YAML-file-for-building-on-.patch volk-1.4/debian/patches/0001-Add-a-AppVeyor-compatible-YAML-file-for-building-on-.patch --- volk-1.3/debian/patches/0001-Add-a-AppVeyor-compatible-YAML-file-for-building-on-.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0001-Add-a-AppVeyor-compatible-YAML-file-for-building-on-.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,76 +0,0 @@ -From 4461f27f6533cf29baaac0ff9cdd9b6241c0840b Mon Sep 17 00:00:00 2001 -From: Paul Cercueil -Date: Wed, 17 Feb 2016 14:51:00 +0100 -Subject: [PATCH 01/18] Add a AppVeyor compatible YAML file for building on the - AppVeyor CI - -Signed-off-by: Paul Cercueil ---- - appveyor.yml | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 55 insertions(+) - create mode 100644 appveyor.yml - -diff --git a/appveyor.yml b/appveyor.yml -new file mode 100644 -index 0000000..052ea51 ---- /dev/null -+++ b/appveyor.yml -@@ -0,0 +1,55 @@ -+clone_depth: 1 -+ -+os: Visual Studio 2013 -+ -+install: -+ - echo "Installing Boost libraries..." -+ - nuget install boost_system-vc120 -+ - nuget install boost_filesystem-vc120 -+ - nuget install boost_chrono-vc120 -+ - nuget install boost_program_options-vc120 -+ - nuget install boost_unit_test_framework-vc120 -+ -+ - echo "Installing Cheetah templates..." 
-+ - appveyor DownloadFile https://pypi.python.org/packages/source/C/Cheetah/Cheetah-2.4.4.tar.gz -+ - 7z x Cheetah-2.4.4.tar.gz -+ - 7z x -y Cheetah-2.4.4.tar -+ - cd Cheetah-2.4.4 -+ - c:\Python27\python.exe setup.py build -+ - c:\Python27\python.exe setup.py install -+ -+build_script: -+ - cd c:\projects\volk -+ -+ # Without this directory in the %PATH%, compiler tests fail because of missing DLLs -+ - set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin -+ -+ - cmake -G "Visual Studio 12 Win64" \ -+ -DBoost_CHRONO_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_chrono-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_chrono-vc120-mt-1_59.lib \ -+ -DBoost_FILESYSTEM_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_filesystem-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_filesystem-vc120-mt-1_59.lib \ -+ -DBoost_PROGRAM_OPTIONS_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_program_options-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_program_options-vc120-mt-1_59.lib \ -+ -DBoost_SYSTEM_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_system-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_system-vc120-mt-1_59.lib \ -+ -DBoost_UNIT_TEST_FRAMEWORK_LIBRARY_RELEASE:FILEPATH=c:/projects/volk/boost_unit_test_framework-vc120.1.59.0.0/lib/native/address-model-64/lib/boost_unit_test_framework-vc120-mt-1_59.lib \ -+ -DBoost_INCLUDE_DIR:PATH=c:/projects/volk/boost.1.59.0.0/lib/native/include \ -+ -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=OFF \ -+ . -+ -+ - cmake --build . 
--config Release --target INSTALL -+ -+ # Create an archive -+ - cd "c:\Program Files" -+ - 7z a "c:\libvolk-x64.zip" volk -+ -+ # Create the deps archive -+ - mkdir dlls -+ - copy c:\projects\volk\boost_chrono-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_chrono-vc120-mt-1_59.dll dlls\boost_chrono-vc120-mt-1_59.dll -+ - copy c:\projects\volk\boost_filesystem-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_filesystem-vc120-mt-1_59.dll dlls\boost_filesystem-vc120-mt-1_59.dll -+ - copy c:\projects\volk\boost_program_options-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_program_options-vc120-mt-1_59.dll dlls\boost_program_options-vc120-mt-1_59.dll -+ - copy c:\projects\volk\boost_system-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_system-vc120-mt-1_59.dll dlls\boost_system-vc120-mt-1_59.dll -+ - copy c:\projects\volk\boost_unit_test_framework-vc120.1.59.0.0\lib\native\address-model-64\lib\boost_unit_test_framework-vc120-mt-1_59.dll dlls\boost_unit_test_framework-vc120-mt-1_59.dll -+ - cd dlls -+ - 7z a "c:\libvolk-x64-deps.zip" * -+ -+ # Push it! 
-+ - appveyor PushArtifact c:\libvolk-x64.zip -+ - appveyor PushArtifact c:\libvolk-x64-deps.zip --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0001-Update-CMakeLists-for-1.5-development-versions.patch volk-1.4/debian/patches/0001-Update-CMakeLists-for-1.5-development-versions.patch --- volk-1.3/debian/patches/0001-Update-CMakeLists-for-1.5-development-versions.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0001-Update-CMakeLists-for-1.5-development-versions.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,25 @@ +From da56e316ca095a09b53ce308a79b522bcb4a6504 Mon Sep 17 00:00:00 2001 +From: Nathan West +Date: Tue, 27 Mar 2018 12:50:12 -0400 +Subject: [PATCH 1/9] Update CMakeLists for 1.5 development versions + +--- + CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a02ae80..d0f076c 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -45,7 +45,7 @@ message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") + + set(VERSION_INFO_MAJOR_VERSION 1) + set(VERSION_INFO_MINOR_VERSION 4) +-set(VERSION_INFO_MAINT_VERSION 0) ++set(VERSION_INFO_MAINT_VERSION 1git) + include(VolkVersion) #setup version info + + ######################################################################## +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch volk-1.4/debian/patches/0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch --- volk-1.3/debian/patches/0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,98 @@ +From 83832b2b922cb3ed979aa520ef7ce27557d5f705 Mon Sep 17 00:00:00 2001 +From: "Brandon P. 
Enochs" +Date: Tue, 3 Apr 2018 16:22:16 -0400 +Subject: [PATCH 2/9] Added an AVX protokernel for + volk_32fc_x2_32f_square_dist_scalar_mult_32f. + +--- + ...volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h | 63 ++++++++++++++++++++-- + 1 file changed, 60 insertions(+), 3 deletions(-) + +diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +index e983578..6180337 100644 +--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +@@ -84,6 +84,65 @@ + #include + #include + ++#ifdef LV_HAVE_AVX ++#include ++ ++static inline void ++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx( ++ float *target, lv_32fc_t *src0, lv_32fc_t *points, ++ float scalar, unsigned int num_points) { ++ static const unsigned int work_size = 8; ++ unsigned int avx_work_size = num_points / work_size * work_size; ++ int i = 0; ++ ++ for (; i < avx_work_size; i += work_size) { ++ lv_32fc_t src = *src0; ++ __m256 source = _mm256_setr_ps( ++ lv_creal(src), lv_cimag(src), ++ lv_creal(src), lv_cimag(src), ++ lv_creal(src), lv_cimag(src), ++ lv_creal(src), lv_cimag(src) ++ ); ++ __m256 points_low = _mm256_load_ps((const float *) points); ++ __m256 points_high = _mm256_load_ps((const float *) (points + work_size / 2)); ++ __m256 difference_low = _mm256_sub_ps(source, points_low); ++ __m256 difference_high = _mm256_sub_ps(source, points_high); ++ ++ difference_low = _mm256_mul_ps(difference_low, difference_low); ++ difference_high = _mm256_mul_ps(difference_high, difference_high); ++ ++ __m256 magnitudes_squared = _mm256_hadd_ps(difference_low, difference_high); ++ __m128 lower_magnitudes_squared_bottom = _mm256_extractf128_ps(magnitudes_squared, 0); ++ __m128 upper_magnitudes_squared_top = _mm256_extractf128_ps(magnitudes_squared, 1); ++ __m256 lower_magnitudes_squared = _mm256_castps128_ps256(lower_magnitudes_squared_bottom); ++ ++ 
lower_magnitudes_squared = _mm256_insertf128_ps( ++ lower_magnitudes_squared, _mm_permute_ps(lower_magnitudes_squared_bottom, 0x4E), 1 ++ ); ++ ++ __m256 upper_magnitudes_squared = _mm256_castps128_ps256(upper_magnitudes_squared_top); ++ ++ upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 1); ++ upper_magnitudes_squared_top = _mm_permute_ps(upper_magnitudes_squared_top, 0x4E); ++ upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 0); ++ ++ __m256 ordered_magnitudes_squared = _mm256_blend_ps(lower_magnitudes_squared, upper_magnitudes_squared, 0xCC); ++ __m256 scalars = _mm256_set1_ps(scalar); ++ __m256 output = _mm256_mul_ps(ordered_magnitudes_squared, scalars); ++ ++ _mm256_store_ps(target, output); ++ target += work_size; ++ points += work_size; ++ } ++ for (; i < num_points; ++i) { ++ lv_32fc_t diff = src0[0] - points[i]; ++ ++ target[i] = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); ++ } ++} ++ ++#endif /* LV_HAVE_AVX */ ++ + #ifdef LV_HAVE_SSE3 + #include + #include +@@ -183,13 +242,11 @@ static inline void + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, + float scalar, unsigned int num_points) + { +- const unsigned int num_bytes = num_points*8; +- + lv_32fc_t diff; + float sq_dist; + unsigned int i = 0; + +- for(; i < num_bytes >> 3; ++i) { ++ for(; i < num_points; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0002-Update-CMakeLists-for-1.3-development.patch volk-1.4/debian/patches/0002-Update-CMakeLists-for-1.3-development.patch --- volk-1.3/debian/patches/0002-Update-CMakeLists-for-1.3-development.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0002-Update-CMakeLists-for-1.3-development.patch 1970-01-01 
00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -From 18428fb9f718f5f7fa34707dd47ab6db07d88683 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Sat, 2 Jul 2016 12:01:28 -0400 -Subject: [PATCH 02/18] Update CMakeLists for 1.3 development - ---- - CMakeLists.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 5ecc9c2..0d0b647 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -45,7 +45,7 @@ message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") - - set(VERSION_INFO_MAJOR_VERSION 1) - set(VERSION_INFO_MINOR_VERSION 3) --set(VERSION_INFO_MAINT_VERSION 0) -+set(VERSION_INFO_MAINT_VERSION 0git) - include(VolkVersion) #setup version info - - --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0003-apps-fix-profile-update-reading-end-of-lines.patch volk-1.4/debian/patches/0003-apps-fix-profile-update-reading-end-of-lines.patch --- volk-1.3/debian/patches/0003-apps-fix-profile-update-reading-end-of-lines.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0003-apps-fix-profile-update-reading-end-of-lines.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -From e296749b8fe936f72e85cbaca57215cb528ed2e5 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Mon, 1 Aug 2016 17:12:24 -0400 -Subject: [PATCH 03/18] apps: fix profile update reading end of lines - ---- - apps/volk_profile.cc | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc -index 2086e3f..51591cc 100644 ---- a/apps/volk_profile.cc -+++ b/apps/volk_profile.cc -@@ -261,7 +261,7 @@ void read_results(std::vector *results, std::string path) - found = 127; - } - str_size = config_str.size(); -- char buffer[128]; -+ char buffer[128] = {'\0'}; - config_str.copy(buffer, found + 1, 0); - buffer[found] = '\0'; - single_kernel_result.push_back(std::string(buffer)); --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0003-extracted-variables-for-the-source-real-and-imaginar.patch 
volk-1.4/debian/patches/0003-extracted-variables-for-the-source-real-and-imaginar.patch --- volk-1.3/debian/patches/0003-extracted-variables-for-the-source-real-and-imaginar.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0003-extracted-variables-for-the-source-real-and-imaginar.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,33 @@ +From 446c79567fd7b39b9f0fabc72bcb952df8778018 Mon Sep 17 00:00:00 2001 +From: "Brandon P. Enochs" +Date: Tue, 3 Apr 2018 16:29:40 -0400 +Subject: [PATCH 3/9] extracted variables for the source real and imaginary + parts in volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx. + +--- + kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +index 6180337..0a3f7ca 100644 +--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +@@ -97,12 +97,9 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx( + + for (; i < avx_work_size; i += work_size) { + lv_32fc_t src = *src0; +- __m256 source = _mm256_setr_ps( +- lv_creal(src), lv_cimag(src), +- lv_creal(src), lv_cimag(src), +- lv_creal(src), lv_cimag(src), +- lv_creal(src), lv_cimag(src) +- ); ++ float src_real = lv_creal(src); ++ float src_imag = lv_cimag(src); ++ __m256 source = _mm256_setr_ps(src_real, src_imag, src_real, src_imag, src_real, src_imag, src_real, src_imag); + __m256 points_low = _mm256_load_ps((const float *) points); + __m256 points_high = _mm256_load_ps((const float *) (points + work_size / 2)); + __m256 difference_low = _mm256_sub_ps(source, points_low); +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0004-apps-fix-profile-update-reading-end-of-lines.patch volk-1.4/debian/patches/0004-apps-fix-profile-update-reading-end-of-lines.patch --- 
volk-1.3/debian/patches/0004-apps-fix-profile-update-reading-end-of-lines.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0004-apps-fix-profile-update-reading-end-of-lines.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -From 0d672945dfca506d4e49e857ce886d2b3dc80e96 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Mon, 1 Aug 2016 17:12:24 -0400 -Subject: [PATCH 04/18] apps: fix profile update reading end of lines - ---- - apps/volk_profile.cc | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc -index 2086e3f..51591cc 100644 ---- a/apps/volk_profile.cc -+++ b/apps/volk_profile.cc -@@ -261,7 +261,7 @@ void read_results(std::vector *results, std::string path) - found = 127; - } - str_size = config_str.size(); -- char buffer[128]; -+ char buffer[128] = {'\0'}; - config_str.copy(buffer, found + 1, 0); - buffer[found] = '\0'; - single_kernel_result.push_back(std::string(buffer)); --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0004-fixed-a-buffer-over-read-and-over-write-in-volk_32fc.patch volk-1.4/debian/patches/0004-fixed-a-buffer-over-read-and-over-write-in-volk_32fc.patch --- volk-1.3/debian/patches/0004-fixed-a-buffer-over-read-and-over-write-in-volk_32fc.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0004-fixed-a-buffer-over-read-and-over-write-in-volk_32fc.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,31 @@ +From 3d0f47381454cf6f79fddb081cee0a50b2a684eb Mon Sep 17 00:00:00 2001 +From: "Brandon P. Enochs" +Date: Tue, 3 Apr 2018 18:20:53 -0400 +Subject: [PATCH 4/9] fixed a buffer over-read and over-write in + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx. 
+ +--- + kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +index 0a3f7ca..8f5b02a 100644 +--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +@@ -132,9 +132,11 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx( + points += work_size; + } + for (; i < num_points; ++i) { +- lv_32fc_t diff = src0[0] - points[i]; ++ lv_32fc_t diff = src0[0] - *points; + +- target[i] = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); ++ *target = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); ++ ++target; ++ ++points; + } + } + +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0005-cmake-Fix-endif-to-match-if.patch volk-1.4/debian/patches/0005-cmake-Fix-endif-to-match-if.patch --- volk-1.3/debian/patches/0005-cmake-Fix-endif-to-match-if.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0005-cmake-Fix-endif-to-match-if.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,23 @@ +From 21d86728946b915bfe2f177dfa97fb2c94a5957a Mon Sep 17 00:00:00 2001 +From: Michael Dickens +Date: Thu, 5 Apr 2018 14:02:57 -0400 +Subject: [PATCH 5/9] cmake: Fix 'endif' to match 'if' + +--- + cmake/Modules/VolkConfigVersion.cmake.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cmake/Modules/VolkConfigVersion.cmake.in b/cmake/Modules/VolkConfigVersion.cmake.in +index 265daeb..988336d 100644 +--- a/cmake/Modules/VolkConfigVersion.cmake.in ++++ b/cmake/Modules/VolkConfigVersion.cmake.in +@@ -30,5 +30,5 @@ if(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION}) + set(PACKAGE_VERSION_EXACT 1) # exact match for API version + set(PACKAGE_VERSION_COMPATIBLE 1) # compat for minor/patch version + endif(NOT 
${PACKAGE_FIND_VERSION_PATCH} GREATER ${MINOR_VERSION}) +- endif(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${API_COMPAT}) ++ endif(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${MINOR_VERSION}) + endif(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION}) +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0005-qa-lower-tolerance-for-32fc_mag-to-fix-issue-96.patch volk-1.4/debian/patches/0005-qa-lower-tolerance-for-32fc_mag-to-fix-issue-96.patch --- volk-1.3/debian/patches/0005-qa-lower-tolerance-for-32fc_mag-to-fix-issue-96.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0005-qa-lower-tolerance-for-32fc_mag-to-fix-issue-96.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,34 +0,0 @@ -From 0f6d889b891bc2ac78c56ad18e43c4ec5a372574 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Thu, 4 Aug 2016 11:30:55 -0400 -Subject: [PATCH 05/18] qa: lower tolerance for 32fc_mag to fix issue #96 - ---- - lib/kernel_tests.h | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h -index 2bf1f0c..7c82733 100644 ---- a/lib/kernel_tests.h -+++ b/lib/kernel_tests.h -@@ -24,6 +24,8 @@ std::vector init_test_list(volk_test_params_t test_params) - // Some kernels need a lower tolerance - volk_test_params_t test_params_inacc = volk_test_params_t(1e-2, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); -+ volk_test_params_t test_params_inacc_tenth = volk_test_params_t(1e-1, test_params.scalar(), -+ test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); - volk_test_params_t test_params_int1 = volk_test_params_t(1, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); - -@@ -79,7 +81,7 @@ std::vector init_test_list(volk_test_params_t test_params) - (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), 
test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params_int1)) -- (VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc)) -+ (VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) - (VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0006-Add-NEON-AVX-and-unaligned-versions-of-SSE4.1-and-SS.patch volk-1.4/debian/patches/0006-Add-NEON-AVX-and-unaligned-versions-of-SSE4.1-and-SS.patch --- volk-1.3/debian/patches/0006-Add-NEON-AVX-and-unaligned-versions-of-SSE4.1-and-SS.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0006-Add-NEON-AVX-and-unaligned-versions-of-SSE4.1-and-SS.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,346 +0,0 @@ -From aeaf56828ba0a08728a0cf2d2370b5d0153332b1 Mon Sep 17 00:00:00 2001 -From: Carles Fernandez -Date: Fri, 23 Sep 2016 19:16:27 +0200 -Subject: [PATCH 06/18] Add NEON, AVX and unaligned versions of SSE4.1 and SSE - ---- - kernels/volk/volk_32f_index_max_32u.h | 316 ++++++++++++++++++++++++++++++++++ - 1 file changed, 316 insertions(+) - -diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h -index 17b8f70..1888405 100644 ---- a/kernels/volk/volk_32f_index_max_32u.h -+++ b/kernels/volk/volk_32f_index_max_32u.h -@@ -130,6 +130,69 @@ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu - #endif /*LV_HAVE_SSE4_1*/ - - -+#ifdef LV_HAVE_SSE4_1 -+#include -+ -+static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) -+{ -+ if(num_points > 0) 
-+ { -+ uint32_t number = 0; -+ const uint32_t quarterPoints = num_points / 4; -+ -+ float* inputPtr = (float*)src0; -+ -+ __m128 indexIncrementValues = _mm_set1_ps(4); -+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); -+ -+ float max = src0[0]; -+ float index = 0; -+ __m128 maxValues = _mm_set1_ps(max); -+ __m128 maxValuesIndex = _mm_setzero_ps(); -+ __m128 compareResults; -+ __m128 currentValues; -+ -+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; -+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; -+ -+ for(;number < quarterPoints; number++) -+ { -+ currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; -+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); -+ compareResults = _mm_cmpgt_ps(maxValues, currentValues); -+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); -+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); -+ } -+ -+ // Calculate the largest value from the remaining 4 points -+ _mm_store_ps(maxValuesBuffer, maxValues); -+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); -+ -+ for(number = 0; number < 4; number++) -+ { -+ if(maxValuesBuffer[number] > max) -+ { -+ index = maxIndexesBuffer[number]; -+ max = maxValuesBuffer[number]; -+ } -+ } -+ -+ number = quarterPoints * 4; -+ for(;number < num_points; number++) -+ { -+ if(src0[number] > max) -+ { -+ index = number; -+ max = src0[number]; -+ } -+ } -+ target[0] = (uint32_t)index; -+ } -+} -+ -+#endif /*LV_HAVE_SSE4_1*/ -+ -+ - #ifdef LV_HAVE_SSE - - #include -@@ -193,6 +256,259 @@ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p - #endif /*LV_HAVE_SSE*/ - - -+#ifdef LV_HAVE_SSE -+#include -+ -+static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) -+{ -+ if(num_points > 0) -+ { -+ uint32_t number = 0; -+ const uint32_t quarterPoints = num_points / 4; -+ -+ float* inputPtr = (float*)src0; -+ -+ __m128 indexIncrementValues = _mm_set1_ps(4); -+ __m128 
currentIndexes = _mm_set_ps(-1,-2,-3,-4); -+ -+ float max = src0[0]; -+ float index = 0; -+ __m128 maxValues = _mm_set1_ps(max); -+ __m128 maxValuesIndex = _mm_setzero_ps(); -+ __m128 compareResults; -+ __m128 currentValues; -+ -+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; -+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; -+ -+ for(;number < quarterPoints; number++) -+ { -+ currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; -+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); -+ compareResults = _mm_cmpgt_ps(maxValues, currentValues); -+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); -+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); -+ } -+ -+ // Calculate the largest value from the remaining 4 points -+ _mm_store_ps(maxValuesBuffer, maxValues); -+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); -+ -+ for(number = 0; number < 4; number++) -+ { -+ if(maxValuesBuffer[number] > max) -+ { -+ index = maxIndexesBuffer[number]; -+ max = maxValuesBuffer[number]; -+ } -+ } -+ -+ number = quarterPoints * 4; -+ for(;number < num_points; number++) -+ { -+ if(src0[number] > max) -+ { -+ index = number; -+ max = src0[number]; -+ } -+ } -+ target[0] = (uint32_t)index; -+ } -+} -+ -+#endif /*LV_HAVE_SSE*/ -+ -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) -+{ -+ if(num_points > 0) -+ { -+ uint32_t number = 0; -+ const uint32_t quarterPoints = num_points / 8; -+ -+ float* inputPtr = (float*)src0; -+ -+ __m256 indexIncrementValues = _mm256_set1_ps(8); -+ __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); -+ -+ float max = src0[0]; -+ float index = 0; -+ __m256 maxValues = _mm256_set1_ps(max); -+ __m256 maxValuesIndex = _mm256_setzero_ps(); -+ __m256 compareResults; -+ __m256 currentValues; -+ -+ 
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; -+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; -+ -+ for(;number < quarterPoints; number++) -+ { -+ currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; -+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); -+ compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); -+ maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); -+ maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); -+ } -+ -+ // Calculate the largest value from the remaining 8 points -+ _mm256_store_ps(maxValuesBuffer, maxValues); -+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); -+ -+ for(number = 0; number < 8; number++) -+ { -+ if(maxValuesBuffer[number] > max) -+ { -+ index = maxIndexesBuffer[number]; -+ max = maxValuesBuffer[number]; -+ } -+ } -+ -+ number = quarterPoints * 8; -+ for(;number < num_points; number++) -+ { -+ if(src0[number] > max) -+ { -+ index = number; -+ max = src0[number]; -+ } -+ } -+ target[0] = (uint32_t)index; -+ } -+} -+ -+#endif /*LV_HAVE_AVX*/ -+ -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) -+{ -+ if(num_points > 0) -+ { -+ uint32_t number = 0; -+ const uint32_t quarterPoints = num_points / 8; -+ -+ float* inputPtr = (float*)src0; -+ -+ __m256 indexIncrementValues = _mm256_set1_ps(8); -+ __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); -+ -+ float max = src0[0]; -+ float index = 0; -+ __m256 maxValues = _mm256_set1_ps(max); -+ __m256 maxValuesIndex = _mm256_setzero_ps(); -+ __m256 compareResults; -+ __m256 currentValues; -+ -+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; -+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; -+ -+ for(;number < quarterPoints; number++) -+ { -+ currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; -+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); -+ 
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); -+ maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); -+ maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); -+ } -+ -+ // Calculate the largest value from the remaining 8 points -+ _mm256_store_ps(maxValuesBuffer, maxValues); -+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); -+ -+ for(number = 0; number < 8; number++) -+ { -+ if(maxValuesBuffer[number] > max) -+ { -+ index = maxIndexesBuffer[number]; -+ max = maxValuesBuffer[number]; -+ } -+ } -+ -+ number = quarterPoints * 8; -+ for(;number < num_points; number++) -+ { -+ if(src0[number] > max) -+ { -+ index = number; -+ max = src0[number]; -+ } -+ } -+ target[0] = (uint32_t)index; -+ } -+} -+ -+#endif /*LV_HAVE_AVX*/ -+ -+ -+#ifdef LV_HAVE_NEON -+#include -+ -+static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) -+{ -+ if(num_points > 0) -+ { -+ uint32_t number = 0; -+ const uint32_t quarterPoints = num_points / 4; -+ -+ float* inputPtr = (float*)src0; -+ float32x4_t indexIncrementValues = vdupq_n_f32(4); -+ __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; -+ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); -+ -+ float max = src0[0]; -+ float index = 0; -+ float32x4_t maxValues = vdupq_n_f32(max); -+ uint32x4_t maxValuesIndex = vmovq_n_u32(0); -+ uint32x4_t compareResults; -+ uint32x4_t currentIndexes_u; -+ float32x4_t currentValues; -+ -+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; -+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; -+ -+ for(;number < quarterPoints; number++) -+ { -+ currentValues = vld1q_f32(inputPtr); inputPtr += 4; -+ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); -+ currentIndexes_u = vcvtq_u32_f32(currentIndexes); -+ compareResults = vcgtq_f32( maxValues, currentValues); -+ maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex 
), vbicq_u32(currentIndexes_u, compareResults) ); -+ maxValues = vmaxq_f32(currentValues, maxValues); -+ } -+ -+ // Calculate the largest value from the remaining 4 points -+ vst1q_f32(maxValuesBuffer, maxValues); -+ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); -+ for(number = 0; number < 4; number++) -+ { -+ if(maxValuesBuffer[number] > max) -+ { -+ index = maxIndexesBuffer[number]; -+ max = maxValuesBuffer[number]; -+ } -+ } -+ -+ number = quarterPoints * 4; -+ for(;number < num_points; number++) -+ { -+ if(src0[number] > max) -+ { -+ index = number; -+ max = src0[number]; -+ } -+ } -+ target[0] = (uint32_t)index; -+ } -+} -+ -+#endif /*LV_HAVE_NEON*/ -+ -+ - #ifdef LV_HAVE_GENERIC - - static inline void --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0006-Add-sys-time.h-header-copied-from-gnuradio-to-fix-Wi.patch volk-1.4/debian/patches/0006-Add-sys-time.h-header-copied-from-gnuradio-to-fix-Wi.patch --- volk-1.3/debian/patches/0006-Add-sys-time.h-header-copied-from-gnuradio-to-fix-Wi.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0006-Add-sys-time.h-header-copied-from-gnuradio-to-fix-Wi.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,91 @@ +From 1b4604aa51c60c10dc4b1ac11b9d6d6ce5ced21e Mon Sep 17 00:00:00 2001 +From: Ryan Volz +Date: Fri, 13 Apr 2018 16:37:23 -0400 +Subject: [PATCH 6/9] Add header (copied from gnuradio) to fix + Windows build. + +--- + cmake/msvc/sys/time.h | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + create mode 100644 cmake/msvc/sys/time.h + +diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h +new file mode 100644 +index 0000000..dca0fdf +--- /dev/null ++++ b/cmake/msvc/sys/time.h +@@ -0,0 +1,71 @@ ++#ifndef _MSC_VER // [ ++#error "Use this header only with Microsoft Visual C++ compilers!" 
++#endif // _MSC_VER ] ++ ++#ifndef _MSC_SYS_TIME_H_ ++#define _MSC_SYS_TIME_H_ ++ ++//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 ++#include < time.h > ++#include //I've omitted this line. ++#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) ++ #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 ++#else ++ #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL ++#endif ++ ++#if _MSC_VER < 1900 ++struct timespec { ++ ++time_t tv_sec; /* Seconds since 00:00:00 GMT, */ ++ ++/* 1 January 1970 */ ++ ++long tv_nsec; /* Additional nanoseconds since */ ++ ++/* tv_sec */ ++ ++}; ++#endif ++ ++struct timezone ++{ ++ int tz_minuteswest; /* minutes W of Greenwich */ ++ int tz_dsttime; /* type of dst correction */ ++}; ++ ++static inline int gettimeofday(struct timeval *tv, struct timezone *tz) ++{ ++ FILETIME ft; ++ unsigned __int64 tmpres = 0; ++ static int tzflag; ++ ++ if (NULL != tv) ++ { ++ GetSystemTimeAsFileTime(&ft); ++ ++ tmpres |= ft.dwHighDateTime; ++ tmpres <<= 32; ++ tmpres |= ft.dwLowDateTime; ++ ++ /*converting file time to unix epoch*/ ++ tmpres -= DELTA_EPOCH_IN_MICROSECS; ++ tv->tv_sec = (long)(tmpres / 1000000UL); ++ tv->tv_usec = (long)(tmpres % 1000000UL); ++ } ++ ++ if (NULL != tz) ++ { ++ if (!tzflag) ++ { ++ _tzset(); ++ tzflag++; ++ } ++ tz->tz_minuteswest = _timezone / 60; ++ tz->tz_dsttime = _daylight; ++ } ++ ++ return 0; ++} ++ ++#endif //_MSC_SYS_TIME_H_ +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0007-added-__VOLK_PREFETCH-compatibility-macro.patch volk-1.4/debian/patches/0007-added-__VOLK_PREFETCH-compatibility-macro.patch --- volk-1.3/debian/patches/0007-added-__VOLK_PREFETCH-compatibility-macro.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0007-added-__VOLK_PREFETCH-compatibility-macro.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,357 +0,0 @@ -From d065c1cdd34c0f5c78911331381e10687faa14a0 Mon Sep 17 00:00:00 2001 -From: Josh Blum -Date: Fri, 20 Jan 2017 
10:03:49 -0800 -Subject: [PATCH 07/18] added __VOLK_PREFETCH() compatibility macro - -__VOLK_PREFETCH() performs __builtin_prefetch() on GCC compilers -and is otherwise a NOP for other systems. The use of __builtin_prefetch -was replaced with __VOLK_PREFETCH() to make the kernels portable. ---- - include/volk/volk_common.h | 3 +++ - kernels/volk/volk_16i_max_star_16i.h | 2 +- - kernels/volk/volk_16i_max_star_horizontal_16i.h | 2 +- - kernels/volk/volk_16ic_convert_32fc.h | 2 +- - kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 28 +++++++++++----------- - kernels/volk/volk_16ic_x2_multiply_16ic.h | 4 ++-- - kernels/volk/volk_32f_x2_add_32f.h | 4 ++-- - kernels/volk/volk_32fc_conjugate_32fc.h | 2 +- - kernels/volk/volk_32fc_convert_16ic.h | 6 ++--- - .../volk/volk_32fc_x2_conjugate_dot_prod_32fc.h | 4 ++-- - kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 16 ++++++------- - kernels/volk/volk_32fc_x2_multiply_32fc.h | 8 +++---- - .../volk/volk_32fc_x2_multiply_conjugate_32fc.h | 4 ++-- - 13 files changed, 44 insertions(+), 41 deletions(-) - -diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h -index 4d35f5c..a53b139 100644 ---- a/include/volk/volk_common.h -+++ b/include/volk/volk_common.h -@@ -16,6 +16,7 @@ - # define __VOLK_ATTR_EXPORT - # define __VOLK_ATTR_IMPORT - # endif -+# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) - #elif _MSC_VER - # define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) - # define __VOLK_ATTR_UNUSED -@@ -23,6 +24,7 @@ - # define __VOLK_ATTR_DEPRECATED __declspec(deprecated) - # define __VOLK_ATTR_EXPORT __declspec(dllexport) - # define __VOLK_ATTR_IMPORT __declspec(dllimport) -+# define __VOLK_PREFETCH(addr) - #else - # define __VOLK_ATTR_ALIGNED(x) - # define __VOLK_ATTR_UNUSED -@@ -30,6 +32,7 @@ - # define __VOLK_ATTR_DEPRECATED - # define __VOLK_ATTR_EXPORT - # define __VOLK_ATTR_IMPORT -+# define __VOLK_PREFETCH(addr) - #endif - - //////////////////////////////////////////////////////////////////////// -diff --git 
a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h -index e470642..531a8b5 100644 ---- a/kernels/volk/volk_16i_max_star_16i.h -+++ b/kernels/volk/volk_16i_max_star_16i.h -@@ -139,7 +139,7 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) - - for(number=0; number < eighth_points; ++number) { - input_vec = vld1q_s16(src0); -- __builtin_prefetch(src0+16); -+ __VOLK_PREFETCH(src0+16); - diff = vsubq_s16(candidate_vec, input_vec); - comp1 = vcgeq_s16(diff, zeros); - comp2 = vcltq_s16(diff, zeros); -diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h -index 1da8356..964587c 100644 ---- a/kernels/volk/volk_16i_max_star_horizontal_16i.h -+++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h -@@ -169,7 +169,7 @@ volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned i - zeros = veorq_s16(zeros, zeros); - for(number=0; number < eighth_points; ++number) { - input_vec = vld2q_s16(src0); -- //__builtin_prefetch(src0+16); -+ //__VOLK_PREFETCH(src0+16); - diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); - comp1 = vcgeq_s16(diff, zeros); - comp2 = vcltq_s16(diff, zeros); -diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h -index 88e079d..9779b0f 100644 ---- a/kernels/volk/volk_16ic_convert_32fc.h -+++ b/kernels/volk/volk_16ic_convert_32fc.h -@@ -198,7 +198,7 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv - for(number = 0; number < sse_iters; number++) - { - a16x4 = vld1_s16((const int16_t*)_in); -- __builtin_prefetch(_in + 4); -+ __VOLK_PREFETCH(_in + 4); - a32x4 = vmovl_s16(a16x4); - f32x4 = vcvtq_f32_s32(a32x4); - vst1q_f32((float32_t*)_out, f32x4); -diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h -index 9d4c882..8e6de4c 100644 ---- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h -+++ 
b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h -@@ -96,9 +96,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 - { - // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg -- __builtin_prefetch(_in_a + 8); -+ __VOLK_PREFETCH(_in_a + 8); - b = _mm_load_si128((__m128i*)_in_b); -- __builtin_prefetch(_in_b + 8); -+ __VOLK_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. -@@ -173,9 +173,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 - { - // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg -- __builtin_prefetch(_in_a + 8); -+ __VOLK_PREFETCH(_in_a + 8); - b = _mm_loadu_si128((__m128i*)_in_b); -- __builtin_prefetch(_in_b + 8); -+ __VOLK_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. -@@ -248,9 +248,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 - for(number = 0; number < avx_iters; number++) - { - a = _mm256_loadu_si256((__m256i*)_in_a); -- __builtin_prefetch(_in_a + 16); -+ __VOLK_PREFETCH(_in_a + 16); - b = _mm256_loadu_si256((__m256i*)_in_b); -- __builtin_prefetch(_in_b + 16); -+ __VOLK_PREFETCH(_in_b + 16); - c = _mm256_mullo_epi16(a, b); - - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. 
-@@ -324,9 +324,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 - for(number = 0; number < avx_iters; number++) - { - a = _mm256_load_si256((__m256i*)_in_a); -- __builtin_prefetch(_in_a + 16); -+ __VOLK_PREFETCH(_in_a + 16); - b = _mm256_load_si256((__m256i*)_in_b); -- __builtin_prefetch(_in_b + 16); -+ __VOLK_PREFETCH(_in_b + 16); - c = _mm256_mullo_epi16(a, b); - - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. -@@ -399,8 +399,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr + 8); -- __builtin_prefetch(b_ptr + 8); -+ __VOLK_PREFETCH(a_ptr + 8); -+ __VOLK_PREFETCH(b_ptr + 8); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r -@@ -465,8 +465,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr + 8); -- __builtin_prefetch(b_ptr + 8); -+ __VOLK_PREFETCH(a_ptr + 8); -+ __VOLK_PREFETCH(b_ptr + 8); - - tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); - tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); -@@ -519,8 +519,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr + 8); -- __builtin_prefetch(b_ptr + 8); -+ __VOLK_PREFETCH(a_ptr + 8); -+ __VOLK_PREFETCH(b_ptr + 8); - - // use 2 accumulators to remove inter-instruction data dependencies - accumulator1.val[0] = 
vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); -diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h -index 17033ae..9dcf06f 100644 ---- a/kernels/volk/volk_16ic_x2_multiply_16ic.h -+++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h -@@ -291,8 +291,8 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr + 4); -- __builtin_prefetch(b_ptr + 4); -+ __VOLK_PREFETCH(a_ptr + 4); -+ __VOLK_PREFETCH(b_ptr + 4); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r -diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h -index fc9cf5b..28cf73d 100644 ---- a/kernels/volk/volk_32f_x2_add_32f.h -+++ b/kernels/volk/volk_32f_x2_add_32f.h -@@ -191,8 +191,8 @@ volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, - // Load in to NEON registers - aVal = vld1q_f32(aPtr); - bVal = vld1q_f32(bPtr); -- __builtin_prefetch(aPtr+4); -- __builtin_prefetch(bPtr+4); -+ __VOLK_PREFETCH(aPtr+4); -+ __VOLK_PREFETCH(bPtr+4); - - // vector add - cVal = vaddq_f32(aVal, bVal); -diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h -index 1fdb6c2..6994d0e 100644 ---- a/kernels/volk/volk_32fc_conjugate_32fc.h -+++ b/kernels/volk/volk_32fc_conjugate_32fc.h -@@ -248,7 +248,7 @@ volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, un - const lv_32fc_t* a = aVector; - - for(number=0; number < quarterPoints; number++){ -- __builtin_prefetch(a+4); -+ __VOLK_PREFETCH(a+4); - x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di - - // xor the imaginary lane -diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h -index 4f6e6a5..307ab36 
100644 ---- a/kernels/volk/volk_32fc_convert_16ic.h -+++ b/kernels/volk/volk_32fc_convert_16ic.h -@@ -75,7 +75,7 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const - { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; -- __builtin_prefetch(inputVectorPtr + 8); -+ __VOLK_PREFETCH(inputVectorPtr + 8); - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); -@@ -128,7 +128,7 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const - { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; -- __builtin_prefetch(inputVectorPtr + 8); -+ __VOLK_PREFETCH(inputVectorPtr + 8); - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); -@@ -184,7 +184,7 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv - { - a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; - b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; -- __builtin_prefetch(inputVectorPtr + 8); -+ __VOLK_PREFETCH(inputVectorPtr + 8); - - ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); - ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); -diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h -index 981899c..4addf80 100644 ---- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h -+++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h -@@ -219,8 +219,8 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+8); -- __builtin_prefetch(b_ptr+8); -+ 
__VOLK_PREFETCH(a_ptr+8); -+ __VOLK_PREFETCH(b_ptr+8); - - // do the first multiply - tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); -diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h -index 39d0c78..0c3271c 100644 ---- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h -+++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h -@@ -894,8 +894,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+8); -- __builtin_prefetch(b_ptr+8); -+ __VOLK_PREFETCH(a_ptr+8); -+ __VOLK_PREFETCH(b_ptr+8); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r -@@ -949,8 +949,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+8); -- __builtin_prefetch(b_ptr+8); -+ __VOLK_PREFETCH(a_ptr+8); -+ __VOLK_PREFETCH(b_ptr+8); - - // do the first multiply - tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); -@@ -998,8 +998,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+8); -- __builtin_prefetch(b_ptr+8); -+ __VOLK_PREFETCH(a_ptr+8); -+ __VOLK_PREFETCH(b_ptr+8); - - // use 2 accumulators to remove inter-instruction data dependencies - accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); 
-@@ -1050,8 +1050,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul - for(number = 0; number < quarter_points; ++number) { - a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+8); -- __builtin_prefetch(b_ptr+8); -+ __VOLK_PREFETCH(a_ptr+8); -+ __VOLK_PREFETCH(b_ptr+8); - - // use 2 accumulators to remove inter-instruction data dependencies - accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); -diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h -index 1709140..0b9d3fe 100644 ---- a/kernels/volk/volk_32fc_x2_multiply_32fc.h -+++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h -@@ -372,8 +372,8 @@ volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+4); -- __builtin_prefetch(b_ptr+4); -+ __VOLK_PREFETCH(a_ptr+4); -+ __VOLK_PREFETCH(b_ptr+4); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r -@@ -420,8 +420,8 @@ volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aV - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i -- __builtin_prefetch(a_ptr+4); -- __builtin_prefetch(b_ptr+4); -+ __VOLK_PREFETCH(a_ptr+4); -+ __VOLK_PREFETCH(b_ptr+4); - - // do the first multiply - tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); -diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h -index 703c78d..c13a32e 
100644 ---- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h -+++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h -@@ -262,8 +262,8 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - b_val.val[1] = vnegq_f32(b_val.val[1]); -- __builtin_prefetch(a_ptr+4); -- __builtin_prefetch(b_ptr+4); -+ __VOLK_PREFETCH(a_ptr+4); -+ __VOLK_PREFETCH(b_ptr+4); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0007-Fix-compile-on-Windows-by-avoiding-min-max-macros-in.patch volk-1.4/debian/patches/0007-Fix-compile-on-Windows-by-avoiding-min-max-macros-in.patch --- volk-1.3/debian/patches/0007-Fix-compile-on-Windows-by-avoiding-min-max-macros-in.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0007-Fix-compile-on-Windows-by-avoiding-min-max-macros-in.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,29 @@ +From 8eef886bb94b927364314877164600701678e864 Mon Sep 17 00:00:00 2001 +From: Ryan Volz +Date: Tue, 17 Apr 2018 10:23:32 -0400 +Subject: [PATCH 7/9] Fix compile on Windows by avoiding min/max macros in + windows.h. + +--- + cmake/msvc/sys/time.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h +index dca0fdf..aa0f5dc 100644 +--- a/cmake/msvc/sys/time.h ++++ b/cmake/msvc/sys/time.h +@@ -5,6 +5,11 @@ + #ifndef _MSC_SYS_TIME_H_ + #define _MSC_SYS_TIME_H_ + ++// prevent windows.h from clobbering min and max functions with macros ++#ifndef NOMINMAX ++#define NOMINMAX ++#endif ++ + //http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 + #include < time.h > + #include //I've omitted this line. 
+-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0008-Fix-add_test-for-Windows.patch volk-1.4/debian/patches/0008-Fix-add_test-for-Windows.patch --- volk-1.3/debian/patches/0008-Fix-add_test-for-Windows.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0008-Fix-add_test-for-Windows.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,71 @@ +From c88395979c25a952b2c8deb9a37138fa2dc6e9e7 Mon Sep 17 00:00:00 2001 +From: Ryan Volz +Date: Tue, 17 Apr 2018 13:45:38 -0400 +Subject: [PATCH 8/9] Fix add_test for Windows. + +--- + cmake/Modules/VolkAddTest.cmake | 35 +++++++++++++++++++---------------- + 1 file changed, 19 insertions(+), 16 deletions(-) + +diff --git a/cmake/Modules/VolkAddTest.cmake b/cmake/Modules/VolkAddTest.cmake +index 16c04f3..46f35f0 100644 +--- a/cmake/Modules/VolkAddTest.cmake ++++ b/cmake/Modules/VolkAddTest.cmake +@@ -176,20 +176,19 @@ function(VOLK_ADD_TEST test_name executable_name) + #set them in the PATH to run tests. The following appends the + #path of a target dependency. + # +- #NOTE: get_target_property LOCATION is being deprecated as of +- #CMake 3.2.0, which just prints a warning & notes that this +- #functionality will be removed in the future. Leave it here for +- #now until someone can figure out how to do this in Windows. +- foreach(target ${test_name} ${VOLK_TEST_TARGET_DEPS}) +- get_target_property(location "${target}" LOCATION) +- if(location) +- get_filename_component(path ${location} PATH) +- string(REGEX REPLACE "\\$\$.*\$" ${CMAKE_BUILD_TYPE} path ${path}) +- list(APPEND libpath ${path}) +- endif(location) +- endforeach(target) +- +- list(APPEND libpath ${DLL_PATHS} "%PATH%") ++ #create a list of target directories to be determined by the ++ #"add_test" command, via the $ operator; make sure the ++ #test's directory is first, since it ($1) is prepended to PATH. 
++ unset(TARGET_DIR_LIST) ++ foreach(target ${executable_name} ${VOLK_TEST_TARGET_DEPS}) ++ list(APPEND TARGET_DIR_LIST "$") ++ endforeach() ++ #replace list separator with the path separator (escaped) ++ string(REPLACE ";" "\\\\;" TARGET_DIR_LIST "${TARGET_DIR_LIST}") ++ ++ #add command line argument (TARGET_DIR_LIST) to path and append current path ++ list(INSERT libpath 0 "%1") ++ list(APPEND libpath "%PATH%") + + #replace list separator with the path separator (escaped) + string(REPLACE ";" "\\;" libpath "${libpath}") +@@ -204,14 +203,18 @@ function(VOLK_ADD_TEST test_name executable_name) + file(APPEND ${bat_file} "SET ${environ}\n") + endforeach(environ) + ++ set(VOLK_TEST_ARGS "${test_name}") ++ + #redo the test args to have a space between each + string(REPLACE ";" " " VOLK_TEST_ARGS "${VOLK_TEST_ARGS}") + + #finally: append the test name to execute +- file(APPEND ${bat_file} ${test_name} " " ${VOLK_TEST_ARGS} "\n") ++ file(APPEND ${bat_file} "${executable_name} ${VOLK_TEST_ARGS}\n") + file(APPEND ${bat_file} "\n") + +- add_test(${test_name} ${bat_file}) ++ add_test(NAME qa_${test_name} ++ COMMAND ${bat_file} ${TARGET_DIR_LIST} ++ ) + endif(WIN32) + + endfunction(VOLK_ADD_TEST) +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0008-Fix-bug-106-volk_64u_popcnt-bug-in-generic-implement.patch volk-1.4/debian/patches/0008-Fix-bug-106-volk_64u_popcnt-bug-in-generic-implement.patch --- volk-1.3/debian/patches/0008-Fix-bug-106-volk_64u_popcnt-bug-in-generic-implement.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0008-Fix-bug-106-volk_64u_popcnt-bug-in-generic-implement.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -From b0b9615e4e5d38c0d8d6bcc06ccefe08682ec352 Mon Sep 17 00:00:00 2001 -From: Nick Foster -Date: Fri, 20 Jan 2017 16:36:01 -0800 -Subject: [PATCH 08/18] Fix bug 106 (volk_64u_popcnt bug in generic - implementation) - ---- - kernels/volk/volk_64u_popcnt.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff 
--git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h -index 653bfb9..cbce2ec 100644 ---- a/kernels/volk/volk_64u_popcnt.h -+++ b/kernels/volk/volk_64u_popcnt.h -@@ -84,7 +84,7 @@ volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) - uint64_t retVal64 = retVal; - - //retVal = valueVector[1]; -- retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 31); -+ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0009-Fix-32u_reverse_32u-for-ARM.patch volk-1.4/debian/patches/0009-Fix-32u_reverse_32u-for-ARM.patch --- volk-1.3/debian/patches/0009-Fix-32u_reverse_32u-for-ARM.patch 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/patches/0009-Fix-32u_reverse_32u-for-ARM.patch 2018-05-07 18:45:54.000000000 +0000 @@ -0,0 +1,31 @@ +From 297fefdd16e5a230f094e0f4ac2918f0586154be Mon Sep 17 00:00:00 2001 +From: Philip Balister +Date: Fri, 27 Apr 2018 10:54:33 +0000 +Subject: [PATCH 9/9] Fix 32u_reverse_32u for ARM. + + * Order of operands in inline asm reversed. + * Worked for some tune setting since gcc generated rbit lr, lr so + order didn't matter. For other tune settings it generated rbit r3, r2 + which would fail QA test. 
+ +Signed-off-by: Philip Balister +--- + kernels/volk/volk_32u_reverse_32u.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h +index acdbacd..417a5d1 100644 +--- a/kernels/volk/volk_32u_reverse_32u.h ++++ b/kernels/volk/volk_32u_reverse_32u.h +@@ -337,7 +337,7 @@ static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, + #include + + #define DO_RBIT \ +- asm("rbit %1,%0" : "=r" (*out_ptr) : "r" (*in_ptr)); \ ++ asm("rbit %0,%1" : "=r" (*out_ptr) : "r" (*in_ptr)); \ + in_ptr++; \ + out_ptr++; + +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/0009-modtool-deconflict-module-include-guards-from-main-v.patch volk-1.4/debian/patches/0009-modtool-deconflict-module-include-guards-from-main-v.patch --- volk-1.3/debian/patches/0009-modtool-deconflict-module-include-guards-from-main-v.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0009-modtool-deconflict-module-include-guards-from-main-v.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -From 5af8aa45fa23f72aff8593f54e7b67e449927681 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Mon, 13 Mar 2017 12:25:35 -0400 -Subject: [PATCH 09/18] modtool: deconflict module include guards from main - volk - ---- - python/volk_modtool/volk_modtool_generate.py | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/python/volk_modtool/volk_modtool_generate.py b/python/volk_modtool/volk_modtool_generate.py -index 83e0d26..6040a7d 100644 ---- a/python/volk_modtool/volk_modtool_generate.py -+++ b/python/volk_modtool/volk_modtool_generate.py -@@ -98,6 +98,9 @@ class volk_modtool: - os.makedirs(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name'])) - - current_kernel_names = self.get_current_kernels() -+ need_ifdef_updates = ["constant.h", "volk_complex.h", "volk_malloc.h", "volk_prefs.h", -+ "volk_common.h", 
"volk_cpu.tmpl.h", "volk_config_fixed.tmpl.h", -+ "volk_typedefs.h", "volk.tmpl.h"] - for root, dirnames, filenames in os.walk(self.my_dict['base']): - for name in filenames: - t_table = map(lambda a: re.search(a, name), current_kernel_names) -@@ -107,10 +110,7 @@ class volk_modtool: - instring = open(infile, 'r').read() - outstring = re.sub(self.volk, 'volk_' + self.my_dict['name'], instring) - # Update the header ifdef guards only where needed -- if((name == "constants.h") or -- (name == "volk_complex.h") or -- (name == "volk_malloc.h") or -- (name == "volk_prefs.h")): -+ if name in need_ifdef_updates: - outstring = re.sub(self.volk_included, 'INCLUDED_VOLK_' + self.my_dict['name'].upper(), outstring) - newname = re.sub(self.volk, 'volk_' + self.my_dict['name'], name) - relpath = os.path.relpath(infile, self.my_dict['base']) --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0010-modtool-update-the-cmake-find-module-for-volk-mods.patch volk-1.4/debian/patches/0010-modtool-update-the-cmake-find-module-for-volk-mods.patch --- volk-1.3/debian/patches/0010-modtool-update-the-cmake-find-module-for-volk-mods.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0010-modtool-update-the-cmake-find-module-for-volk-mods.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ -From 663dbd00b3e4bd3ddb0b7f8a9360df132d7f0d56 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Mon, 13 Mar 2017 12:37:18 -0400 -Subject: [PATCH 10/18] modtool: update the cmake find module for volk mods - ---- - python/volk_modtool/volk_modtool_generate.py | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/python/volk_modtool/volk_modtool_generate.py b/python/volk_modtool/volk_modtool_generate.py -index 6040a7d..75232ed 100644 ---- a/python/volk_modtool/volk_modtool_generate.py -+++ b/python/volk_modtool/volk_modtool_generate.py -@@ -113,6 +113,10 @@ class volk_modtool: - if name in need_ifdef_updates: - outstring = re.sub(self.volk_included, 'INCLUDED_VOLK_' + 
self.my_dict['name'].upper(), outstring) - newname = re.sub(self.volk, 'volk_' + self.my_dict['name'], name) -+ if name == 'VolkConfig.cmake.in': -+ outstring = re.sub("VOLK", 'VOLK_' + self.my_dict['name'].upper(), outstring) -+ newname = "Volk%sConfig.cmake.in" % self.my_dict['name'] -+ - relpath = os.path.relpath(infile, self.my_dict['base']) - newrelpath = re.sub(self.volk, 'volk_' + self.my_dict['name'], relpath) - dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname) --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0011-Use-powf-to-match-variables-and-avoid-implicit-type-.patch volk-1.4/debian/patches/0011-Use-powf-to-match-variables-and-avoid-implicit-type-.patch --- volk-1.3/debian/patches/0011-Use-powf-to-match-variables-and-avoid-implicit-type-.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0011-Use-powf-to-match-variables-and-avoid-implicit-type-.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -From 28b03a9a338dc24b002413e880222fe1d49f77f5 Mon Sep 17 00:00:00 2001 -From: Michael Dickens -Date: Sat, 1 Apr 2017 15:24:46 -0400 -Subject: [PATCH 11/18] Use 'powf' to match variables and avoid implicit type - converstion. Makes some older compilers happy, allowing 'make test' to pass. 
- ---- - kernels/volk/volk_32f_x2_pow_32f.h | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h -index 58fecb6..a8cb2e1 100644 ---- a/kernels/volk/volk_32f_x2_pow_32f.h -+++ b/kernels/volk/volk_32f_x2_pow_32f.h -@@ -190,7 +190,7 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, - - number = quarterPoints * 4; - for(;number < num_points; number++){ -- *cPtr++ = pow(*aPtr++, *bPtr++); -+ *cPtr++ = powf(*aPtr++, *bPtr++); - } - } - -@@ -215,7 +215,7 @@ volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ -- *cPtr++ = pow(*aPtr++, *bPtr++); -+ *cPtr++ = powf(*aPtr++, *bPtr++); - } - } - #endif /* LV_HAVE_GENERIC */ -@@ -326,7 +326,7 @@ volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, - - number = quarterPoints * 4; - for(;number < num_points; number++){ -- *cPtr++ = pow(*aPtr++, *bPtr++); -+ *cPtr++ = powf(*aPtr++, *bPtr++); - } - } - --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0012-cmake-support-empty-CMAKE_INSTALL_PREFIX.patch volk-1.4/debian/patches/0012-cmake-support-empty-CMAKE_INSTALL_PREFIX.patch --- volk-1.3/debian/patches/0012-cmake-support-empty-CMAKE_INSTALL_PREFIX.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0012-cmake-support-empty-CMAKE_INSTALL_PREFIX.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -From 67202d7b46f9ce55625d0ce5c3a2d98dff56b09a Mon Sep 17 00:00:00 2001 -From: Josh Blum -Date: Wed, 5 Oct 2016 14:09:05 -0700 -Subject: [PATCH 12/18] cmake: support empty CMAKE_INSTALL_PREFIX - -Needed quotes for the string escape command ---- - lib/CMakeLists.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt -index ad5653c..45b6c51 100644 ---- a/lib/CMakeLists.txt -+++ b/lib/CMakeLists.txt -@@ -489,7 +489,7 @@ endif() - 
message(STATUS "Loading version ${VERSION} into constants...") - - #double escape for windows backslash path separators --string(REPLACE "\\" "\\\\" prefix ${prefix}) -+string(REPLACE "\\" "\\\\" prefix "${prefix}") - - configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/constants.c.in --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0013-Support-relocated-install-with-VOLK_PREFIX-env-var.patch volk-1.4/debian/patches/0013-Support-relocated-install-with-VOLK_PREFIX-env-var.patch --- volk-1.3/debian/patches/0013-Support-relocated-install-with-VOLK_PREFIX-env-var.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0013-Support-relocated-install-with-VOLK_PREFIX-env-var.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,35 +0,0 @@ -From 0d1065854848494f211c990ed26267565cc44647 Mon Sep 17 00:00:00 2001 -From: Josh Blum -Date: Thu, 6 Oct 2016 15:06:09 -0700 -Subject: [PATCH 13/18] Support relocated install with VOLK_PREFIX env var - -Some packaging systems such as snaps will install -the volk library to a dynamically chosen location. -The install script can set an evironment variable -so that the library reports the correct prefix. 
---- - lib/constants.c.in | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/lib/constants.c.in b/lib/constants.c.in -index 3839f53..a81c7cb 100644 ---- a/lib/constants.c.in -+++ b/lib/constants.c.in -@@ -24,11 +24,14 @@ - #include - #endif - -+#include - #include - - char* - volk_prefix() - { -+ const char *prefix = getenv("VOLK_PREFIX"); -+ if (prefix != NULL) return (char *)prefix; - return "@prefix@"; - } - --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0014-Fixing-a-minimal-bug-in-the-log2-docstring.patch volk-1.4/debian/patches/0014-Fixing-a-minimal-bug-in-the-log2-docstring.patch --- volk-1.3/debian/patches/0014-Fixing-a-minimal-bug-in-the-log2-docstring.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0014-Fixing-a-minimal-bug-in-the-log2-docstring.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -From ee70be38a66beb5eb236a3ffb3fc147a5d053979 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marcus=20M=C3=BCller?= -Date: Mon, 20 Nov 2017 15:12:06 +0100 -Subject: [PATCH 14/18] Fixing a minimal bug in the log2 docstring - ---- - kernels/volk/volk_32f_log2_32f.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h -index 6704129..c3bfeaa 100644 ---- a/kernels/volk/volk_32f_log2_32f.h -+++ b/kernels/volk/volk_32f_log2_32f.h -@@ -62,7 +62,7 @@ - * \li num_points: The number of data points. - * - * \b Outputs -- * \li cVector: The output vector. -+ * \li bVector: The output vector. 
- * - * \b Example - * \code --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch volk-1.4/debian/patches/0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch --- volk-1.3/debian/patches/0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,140 +0,0 @@ -From 82a88672d80ef7652548182e819e726874e0adc0 Mon Sep 17 00:00:00 2001 -From: Damian Miralles -Date: Wed, 13 Dec 2017 13:27:17 -0700 -Subject: [PATCH 15/18] kernel: Adds unaligned protokernles to - `32f_x2_s32f_interleave_16ic` and `32f_x2_subtract_32f` - -Adds unaligned versions to the afore mentioned kernels, relative speeds -improvements shown in both cases. ---- - kernels/volk/volk_32f_x2_s32f_interleave_16ic.h | 63 +++++++++++++++++++++++++ - kernels/volk/volk_32f_x2_subtract_32f.h | 45 ++++++++++++++++++ - 2 files changed, 108 insertions(+) - -diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h -index 99f1b5e..20f66ff 100644 ---- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h -+++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h -@@ -214,3 +214,66 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* - - - #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */ -+ -+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H -+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H -+ -+#include -+#include -+#include -+ -+#ifdef LV_HAVE_AVX2 -+#include -+ -+static inline void -+volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer, -+ const float* qBuffer, const float scalar, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const float* iBufferPtr = iBuffer; -+ const float* qBufferPtr = qBuffer; -+ -+ __m256 vScalar = 
_mm256_set1_ps(scalar); -+ -+ const unsigned int eighthPoints = num_points / 8; -+ -+ __m256 iValue, qValue, cplxValue1, cplxValue2; -+ __m256i intValue1, intValue2; -+ -+ int16_t* complexVectorPtr = (int16_t*)complexVector; -+ -+ for(;number < eighthPoints; number++){ -+ iValue = _mm256_loadu_ps(iBufferPtr); -+ qValue = _mm256_loadu_ps(qBufferPtr); -+ -+ // Interleaves the lower two values in the i and q variables into one buffer -+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); -+ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); -+ -+ // Interleaves the upper two values in the i and q variables into one buffer -+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); -+ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); -+ -+ intValue1 = _mm256_cvtps_epi32(cplxValue1); -+ intValue2 = _mm256_cvtps_epi32(cplxValue2); -+ -+ intValue1 = _mm256_packs_epi32(intValue1, intValue2); -+ -+ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); -+ complexVectorPtr += 16; -+ -+ iBufferPtr += 8; -+ qBufferPtr += 8; -+ } -+ -+ number = eighthPoints * 8; -+ complexVectorPtr = (int16_t*)(&complexVector[number]); -+ for(; number < num_points; number++){ -+ *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); -+ *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); -+ } -+} -+#endif /* LV_HAVE_AVX2 */ -+ -+ -+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */ -diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h -index 4a452fd..b7f36cf 100644 ---- a/kernels/volk/volk_32f_x2_subtract_32f.h -+++ b/kernels/volk/volk_32f_x2_subtract_32f.h -@@ -176,3 +176,48 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, - - - #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */ -+ -+ -+#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H -+#define INCLUDED_volk_32f_x2_subtract_32f_u_H -+ -+#include -+#include -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* 
aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eighthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr = bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eighthPoints; number++){ -+ -+ aVal = _mm256_loadu_ps(aPtr); -+ bVal = _mm256_loadu_ps(bPtr); -+ -+ cVal = _mm256_sub_ps(aVal, bVal); -+ -+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eighthPoints * 8; -+ for(;number < num_points; number++){ -+ *cPtr++ = (*aPtr++) - (*bPtr++); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */ --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0016-kernels-Adds-AVX-support-to-volk_32f_-kernels.patch volk-1.4/debian/patches/0016-kernels-Adds-AVX-support-to-volk_32f_-kernels.patch --- volk-1.3/debian/patches/0016-kernels-Adds-AVX-support-to-volk_32f_-kernels.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0016-kernels-Adds-AVX-support-to-volk_32f_-kernels.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,422 +0,0 @@ -From 940489f72b2c80f6b5dc514401773bf67a992f23 Mon Sep 17 00:00:00 2001 -From: Damian Miralles -Date: Fri, 15 Dec 2017 23:05:58 -0700 -Subject: [PATCH 16/18] kernels: Adds AVX support to `volk_32f_*` kernels - -Adds AVX support to `volk_32f_s32f_normalize`,`volk_32f_s32f_stddev_32f`, -`volk_32f_sqrt_32f`, `volk_32f_x2_max_32f` and `volk_32f_x2_min_32f`. -Some speed improvements can be seen with the new protokernel addition. 
---- - kernels/volk/volk_32f_s32f_normalize.h | 74 ++++++++++++++++++++++++++++- - kernels/volk/volk_32f_s32f_stddev_32f.h | 59 +++++++++++++++++++++++ - kernels/volk/volk_32f_sqrt_32f.h | 33 +++++++++++++ - kernels/volk/volk_32f_x2_max_32f.h | 84 +++++++++++++++++++++++++++++++++ - kernels/volk/volk_32f_x2_min_32f.h | 84 +++++++++++++++++++++++++++++++++ - 5 files changed, 333 insertions(+), 1 deletion(-) - -diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h -index 52bf006..17d9da9 100644 ---- a/kernels/volk/volk_32f_s32f_normalize.h -+++ b/kernels/volk/volk_32f_s32f_normalize.h -@@ -105,6 +105,39 @@ static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float s - } - #endif /* LV_HAVE_SSE */ - -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){ -+ unsigned int number = 0; -+ float* inputPtr = vecBuffer; -+ -+ const float invScalar = 1.0 / scalar; -+ __m256 vecScalar = _mm256_set1_ps(invScalar); -+ __m256 input1; -+ -+ const uint64_t eigthPoints = num_points / 8; -+ for(;number < eigthPoints; number++){ -+ -+ input1 = _mm256_load_ps(inputPtr); -+ -+ input1 = _mm256_mul_ps(input1, vecScalar); -+ -+ _mm256_store_ps(inputPtr, input1); -+ -+ inputPtr += 8; -+ } -+ -+ number = eigthPoints*8; -+ for(; number < num_points; number++){ -+ *inputPtr *= invScalar; -+ inputPtr++; -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+ - #ifdef LV_HAVE_GENERIC - - static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ -@@ -128,6 +161,45 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s - #endif /* LV_HAVE_GENERIC */ - - -+#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ - - --#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ -+#ifndef INCLUDED_volk_32f_s32f_normalize_u_H -+#define INCLUDED_volk_32f_s32f_normalize_u_H -+ -+#include 
-+#include -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){ -+ unsigned int number = 0; -+ float* inputPtr = vecBuffer; -+ -+ const float invScalar = 1.0 / scalar; -+ __m256 vecScalar = _mm256_set1_ps(invScalar); -+ __m256 input1; -+ -+ const uint64_t eigthPoints = num_points / 8; -+ for(;number < eigthPoints; number++){ -+ -+ input1 = _mm256_loadu_ps(inputPtr); -+ -+ input1 = _mm256_mul_ps(input1, vecScalar); -+ -+ _mm256_storeu_ps(inputPtr, input1); -+ -+ inputPtr += 8; -+ } -+ -+ number = eigthPoints*8; -+ for(; number < num_points; number++){ -+ *inputPtr *= invScalar; -+ inputPtr++; -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+ -+#endif /* INCLUDED_volk_32f_s32f_normalize_u_H */ -diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h -index 30f0ed6..f97a783 100644 ---- a/kernels/volk/volk_32f_s32f_stddev_32f.h -+++ b/kernels/volk/volk_32f_s32f_stddev_32f.h -@@ -132,6 +132,65 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, - #endif /* LV_HAVE_SSE4_1 */ - - -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer, -+ const float mean, unsigned int num_points) -+{ -+ float returnValue = 0; -+ if(num_points > 0){ -+ unsigned int number = 0; -+ const unsigned int thirtySecondPoints = num_points / 32; -+ -+ const float* aPtr = inputBuffer; -+ -+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; -+ -+ __m256 squareAccumulator = _mm256_setzero_ps(); -+ __m256 aVal1, aVal2, aVal3, aVal4; -+ __m256 cVal1, cVal2, cVal3, cVal4; -+ for(;number < thirtySecondPoints; number++) { -+ aVal1 = _mm256_load_ps(aPtr); aPtr += 8; -+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); -+ -+ aVal2 = _mm256_load_ps(aPtr); aPtr += 8; -+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); -+ -+ aVal3 = _mm256_load_ps(aPtr); aPtr += 8; -+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 
0xF4); -+ -+ aVal4 = _mm256_load_ps(aPtr); aPtr += 8; -+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); -+ -+ cVal1 = _mm256_or_ps(cVal1, cVal2); -+ cVal3 = _mm256_or_ps(cVal3, cVal4); -+ cVal1 = _mm256_or_ps(cVal1, cVal3); -+ -+ squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 -+ } -+ _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container -+ returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; -+ returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; -+ returnValue += squareBuffer[4]; returnValue += squareBuffer[5]; -+ returnValue += squareBuffer[6]; returnValue += squareBuffer[7]; -+ -+ number = thirtySecondPoints * 32; -+ for(;number < num_points; number++){ -+ returnValue += (*aPtr) * (*aPtr); -+ aPtr++; -+ } -+ returnValue /= num_points; -+ returnValue -= (mean * mean); -+ returnValue = sqrtf(returnValue); -+ } -+ *stddev = returnValue; -+} -+ -+#endif /* LV_HAVE_AVX */ -+ -+ - #ifdef LV_HAVE_SSE - #include - -diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h -index a5851a0..174f8e3 100644 ---- a/kernels/volk/volk_32f_sqrt_32f.h -+++ b/kernels/volk/volk_32f_sqrt_32f.h -@@ -102,6 +102,39 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p - #endif /* LV_HAVE_SSE */ - - -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ -+ __m256 aVal, cVal; -+ for(;number < eigthPoints; number++) { -+ aVal = _mm256_load_ps(aPtr); -+ -+ cVal = _mm256_sqrt_ps(aVal); -+ -+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++) { -+ *cPtr++ = sqrtf(*aPtr++); -+ } 
-+} -+ -+#endif /* LV_HAVE_AVX */ -+ -+ - #ifdef LV_HAVE_NEON - #include - -diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h -index 14747c2..1dc0f7d 100644 ---- a/kernels/volk/volk_32f_x2_max_32f.h -+++ b/kernels/volk/volk_32f_x2_max_32f.h -@@ -112,6 +112,44 @@ volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, - #endif /* LV_HAVE_SSE */ - - -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr= bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eigthPoints; number++){ -+ aVal = _mm256_load_ps(aPtr); -+ bVal = _mm256_load_ps(bPtr); -+ -+ cVal = _mm256_max_ps(aVal, bVal); -+ -+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++){ -+ const float a = *aPtr++; -+ const float b = *bPtr++; -+ *cPtr++ = ( a > b ? 
a : b); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+ - #ifdef LV_HAVE_NEON - #include - -@@ -180,3 +218,49 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, - - - #endif /* INCLUDED_volk_32f_x2_max_32f_a_H */ -+ -+ -+#ifndef INCLUDED_volk_32f_x2_max_32f_u_H -+#define INCLUDED_volk_32f_x2_max_32f_u_H -+ -+#include -+#include -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr= bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eigthPoints; number++){ -+ aVal = _mm256_loadu_ps(aPtr); -+ bVal = _mm256_loadu_ps(bPtr); -+ -+ cVal = _mm256_max_ps(aVal, bVal); -+ -+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++){ -+ const float a = *aPtr++; -+ const float b = *bPtr++; -+ *cPtr++ = ( a > b ? 
a : b); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */ -diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h -index f3cbae1..3beb5fa 100644 ---- a/kernels/volk/volk_32f_x2_min_32f.h -+++ b/kernels/volk/volk_32f_x2_min_32f.h -@@ -112,6 +112,44 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, - #endif /* LV_HAVE_SSE */ - - -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr= bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eigthPoints; number++){ -+ aVal = _mm256_load_ps(aPtr); -+ bVal = _mm256_load_ps(bPtr); -+ -+ cVal = _mm256_min_ps(aVal, bVal); -+ -+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++){ -+ const float a = *aPtr++; -+ const float b = *bPtr++; -+ *cPtr++ = ( a < b ? 
a : b); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+ - #ifdef LV_HAVE_NEON - #include - -@@ -183,3 +221,49 @@ volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, - - - #endif /* INCLUDED_volk_32f_x2_min_32f_a_H */ -+ -+ -+#ifndef INCLUDED_volk_32f_x2_min_32f_u_H -+#define INCLUDED_volk_32f_x2_min_32f_u_H -+ -+#include -+#include -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr= bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eigthPoints; number++){ -+ aVal = _mm256_loadu_ps(aPtr); -+ bVal = _mm256_loadu_ps(bPtr); -+ -+ cVal = _mm256_min_ps(aVal, bVal); -+ -+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++){ -+ const float a = *aPtr++; -+ const float b = *bPtr++; -+ *cPtr++ = ( a < b ? a : b); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+#endif /* INCLUDED_volk_32f_x2_min_32f_u_H */ --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0017-kernels-Add-AVX-support-to-32f_x2_divide_32f-32f_x2_.patch volk-1.4/debian/patches/0017-kernels-Add-AVX-support-to-32f_x2_divide_32f-32f_x2_.patch --- volk-1.3/debian/patches/0017-kernels-Add-AVX-support-to-32f_x2_divide_32f-32f_x2_.patch 2018-02-04 18:08:35.000000000 +0000 +++ volk-1.4/debian/patches/0017-kernels-Add-AVX-support-to-32f_x2_divide_32f-32f_x2_.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,271 +0,0 @@ -From 0dd53d3ad8e24a833342b369743f274a15a66274 Mon Sep 17 00:00:00 2001 -From: Damian Miralles -Date: Wed, 20 Dec 2017 21:01:52 -0700 -Subject: [PATCH 17/18] kernels: Add AVX support to - `32f_x2_divide_32f`,`32f_x2_dot_prod_16i` - -Adds protokernels for AVX support. 
Modest speed improvements in some of -the kernels, however, it seems to be related to the host architecture -being used ---- - kernels/volk/volk_32f_x2_divide_32f.h | 80 +++++++++++++++++ - kernels/volk/volk_32f_x2_dot_prod_16i.h | 148 ++++++++++++++++++++++++++++++++ - 2 files changed, 228 insertions(+) - -diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h -index d724173..7cc34ca 100644 ---- a/kernels/volk/volk_32f_x2_divide_32f.h -+++ b/kernels/volk/volk_32f_x2_divide_32f.h -@@ -110,6 +110,42 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, - #endif /* LV_HAVE_SSE */ - - -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr= bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eigthPoints; number++){ -+ aVal = _mm256_load_ps(aPtr); -+ bVal = _mm256_load_ps(bPtr); -+ -+ cVal = _mm256_div_ps(aVal, bVal); -+ -+ _mm256_store_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++){ -+ *cPtr++ = (*aPtr++) / (*bPtr++); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+ - #ifdef LV_HAVE_GENERIC - - static inline void -@@ -145,3 +181,47 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, - - - #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ -+ -+ -+#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H -+#define INCLUDED_volk_32f_x2_divide_32f_u_H -+ -+#include -+#include -+ -+#ifdef LV_HAVE_AVX -+#include -+ -+static inline void -+volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector, -+ const float* bVector, unsigned int num_points) -+{ -+ unsigned int number = 0; -+ const unsigned int 
eigthPoints = num_points / 8; -+ -+ float* cPtr = cVector; -+ const float* aPtr = aVector; -+ const float* bPtr= bVector; -+ -+ __m256 aVal, bVal, cVal; -+ for(;number < eigthPoints; number++){ -+ aVal = _mm256_loadu_ps(aPtr); -+ bVal = _mm256_loadu_ps(bPtr); -+ -+ cVal = _mm256_div_ps(aVal, bVal); -+ -+ _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container -+ -+ aPtr += 8; -+ bPtr += 8; -+ cPtr += 8; -+ } -+ -+ number = eigthPoints * 8; -+ for(;number < num_points; number++){ -+ *cPtr++ = (*aPtr++) / (*bPtr++); -+ } -+} -+#endif /* LV_HAVE_AVX */ -+ -+#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */ -diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h -index 15f01b7..a1279cf 100644 ---- a/kernels/volk/volk_32f_x2_dot_prod_16i.h -+++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h -@@ -82,6 +82,154 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float - #endif /*LV_HAVE_GENERIC*/ - - -+#ifdef LV_HAVE_AVX -+ -+static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { -+ -+ unsigned int number = 0; -+ const unsigned int thirtySecondPoints = num_points / 32; -+ -+ float dotProduct = 0; -+ const float* aPtr = input; -+ const float* bPtr = taps; -+ -+ __m256 a0Val, a1Val, a2Val, a3Val; -+ __m256 b0Val, b1Val, b2Val, b3Val; -+ __m256 c0Val, c1Val, c2Val, c3Val; -+ -+ __m256 dotProdVal0 = _mm256_setzero_ps(); -+ __m256 dotProdVal1 = _mm256_setzero_ps(); -+ __m256 dotProdVal2 = _mm256_setzero_ps(); -+ __m256 dotProdVal3 = _mm256_setzero_ps(); -+ -+ for(;number < thirtySecondPoints; number++){ -+ -+ a0Val = _mm256_load_ps(aPtr); -+ a1Val = _mm256_load_ps(aPtr+8); -+ a2Val = _mm256_load_ps(aPtr+16); -+ a3Val = _mm256_load_ps(aPtr+24); -+ -+ b0Val = _mm256_load_ps(bPtr); -+ b1Val = _mm256_load_ps(bPtr+8); -+ b2Val = _mm256_load_ps(bPtr+16); -+ b3Val = _mm256_load_ps(bPtr+24); -+ -+ c0Val = _mm256_mul_ps(a0Val, 
b0Val); -+ c1Val = _mm256_mul_ps(a1Val, b1Val); -+ c2Val = _mm256_mul_ps(a2Val, b2Val); -+ c3Val = _mm256_mul_ps(a3Val, b3Val); -+ -+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); -+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); -+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); -+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); -+ -+ aPtr += 32; -+ bPtr += 32; -+ } -+ -+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); -+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); -+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); -+ -+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; -+ -+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector -+ -+ dotProduct = dotProductVector[0]; -+ dotProduct += dotProductVector[1]; -+ dotProduct += dotProductVector[2]; -+ dotProduct += dotProductVector[3]; -+ dotProduct += dotProductVector[4]; -+ dotProduct += dotProductVector[5]; -+ dotProduct += dotProductVector[6]; -+ dotProduct += dotProductVector[7]; -+ -+ number = thirtySecondPoints*32; -+ for(;number < num_points; number++){ -+ dotProduct += ((*aPtr++) * (*bPtr++)); -+ } -+ -+ *result = (short)dotProduct; -+} -+ -+#endif /*LV_HAVE_AVX*/ -+ -+ -+#ifdef LV_HAVE_AVX -+ -+static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { -+ -+ unsigned int number = 0; -+ const unsigned int thirtySecondPoints = num_points / 32; -+ -+ float dotProduct = 0; -+ const float* aPtr = input; -+ const float* bPtr = taps; -+ -+ __m256 a0Val, a1Val, a2Val, a3Val; -+ __m256 b0Val, b1Val, b2Val, b3Val; -+ __m256 c0Val, c1Val, c2Val, c3Val; -+ -+ __m256 dotProdVal0 = _mm256_setzero_ps(); -+ __m256 dotProdVal1 = _mm256_setzero_ps(); -+ __m256 dotProdVal2 = _mm256_setzero_ps(); -+ __m256 dotProdVal3 = _mm256_setzero_ps(); -+ -+ for(;number < thirtySecondPoints; number++){ -+ -+ a0Val = _mm256_loadu_ps(aPtr); -+ a1Val = _mm256_loadu_ps(aPtr+8); -+ a2Val = 
_mm256_loadu_ps(aPtr+16); -+ a3Val = _mm256_loadu_ps(aPtr+24); -+ -+ b0Val = _mm256_loadu_ps(bPtr); -+ b1Val = _mm256_loadu_ps(bPtr+8); -+ b2Val = _mm256_loadu_ps(bPtr+16); -+ b3Val = _mm256_loadu_ps(bPtr+24); -+ -+ c0Val = _mm256_mul_ps(a0Val, b0Val); -+ c1Val = _mm256_mul_ps(a1Val, b1Val); -+ c2Val = _mm256_mul_ps(a2Val, b2Val); -+ c3Val = _mm256_mul_ps(a3Val, b3Val); -+ -+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); -+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); -+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); -+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); -+ -+ aPtr += 32; -+ bPtr += 32; -+ } -+ -+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); -+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); -+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); -+ -+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; -+ -+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector -+ -+ dotProduct = dotProductVector[0]; -+ dotProduct += dotProductVector[1]; -+ dotProduct += dotProductVector[2]; -+ dotProduct += dotProductVector[3]; -+ dotProduct += dotProductVector[4]; -+ dotProduct += dotProductVector[5]; -+ dotProduct += dotProductVector[6]; -+ dotProduct += dotProductVector[7]; -+ -+ number = thirtySecondPoints*32; -+ for(;number < num_points; number++){ -+ dotProduct += ((*aPtr++) * (*bPtr++)); -+ } -+ -+ *result = (short)dotProduct; -+} -+ -+#endif /*LV_HAVE_AVX*/ -+ -+ - #ifdef LV_HAVE_SSE - - static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { --- -2.11.0 - diff -Nru volk-1.3/debian/patches/0018-fix-GH-issue-139-for-32fc_index_max_-kernels.patch volk-1.4/debian/patches/0018-fix-GH-issue-139-for-32fc_index_max_-kernels.patch --- volk-1.3/debian/patches/0018-fix-GH-issue-139-for-32fc_index_max_-kernels.patch 2018-02-04 18:08:35.000000000 +0000 +++ 
volk-1.4/debian/patches/0018-fix-GH-issue-139-for-32fc_index_max_-kernels.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -From 0109b2ed06f907363d3ea5a05d24db4992e2d1a5 Mon Sep 17 00:00:00 2001 -From: Nathan West -Date: Tue, 23 Jan 2018 12:02:03 -0500 -Subject: [PATCH 18/18] fix GH issue #139 for 32fc_index_max_* kernels - ---- - kernels/volk/volk_32fc_index_max_16u.h | 3 +-- - kernels/volk/volk_32fc_index_max_32u.h | 2 +- - 2 files changed, 2 insertions(+), 3 deletions(-) - -diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h -index c13196a..14b0d22 100644 ---- a/kernels/volk/volk_32fc_index_max_16u.h -+++ b/kernels/volk/volk_32fc_index_max_16u.h -@@ -115,10 +115,9 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, - int i = 0; - - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! -- xmm9 = xmm8 = _mm_setzero_si128(); -+ xmm9 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); -- - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - - for(; i < bound; ++i) { -diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h -index ad794fb..5665582 100644 ---- a/kernels/volk/volk_32fc_index_max_32u.h -+++ b/kernels/volk/volk_32fc_index_max_32u.h -@@ -104,7 +104,7 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, - int i = 0; - - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! -- xmm9 = xmm8 = _mm_setzero_si128(); -+ xmm9 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); - --- -2.11.0 - diff -Nru volk-1.3/debian/patches/install-all-headers volk-1.4/debian/patches/install-all-headers --- volk-1.3/debian/patches/install-all-headers 2016-07-02 22:57:34.000000000 +0000 +++ volk-1.4/debian/patches/install-all-headers 1970-01-01 00:00:00.000000000 +0000 @@ -1,33 +0,0 @@ -From: A. 
Maitland Bottoms -Subject: install all headers - -(Along with some sorting) - ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -158,17 +158,20 @@ - ) - - install(FILES -- ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h -- ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h -- ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h -+ ${CMAKE_SOURCE_DIR}/include/volk/constants.h -+ ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h -- ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h -+ ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h -+ ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h -+ ${CMAKE_SOURCE_DIR}/include/volk/volk_malloc.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h -+ ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h -+ ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h -+ ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h - ${CMAKE_BINARY_DIR}/include/volk/volk.h - ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h - ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h - ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h -- ${CMAKE_SOURCE_DIR}/include/volk/volk_malloc.h - DESTINATION include/volk - COMPONENT "volk_devel" - ) diff -Nru volk-1.3/debian/patches/libm-link volk-1.4/debian/patches/libm-link --- volk-1.3/debian/patches/libm-link 2017-08-27 19:28:28.000000000 +0000 +++ volk-1.4/debian/patches/libm-link 2018-03-28 03:16:17.000000000 +0000 @@ -1,11 +1,20 @@ --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt -@@ -544,7 +544,7 @@ +@@ -542,7 +542,7 @@ #Add dynamic library add_library(volk SHARED $) - target_link_libraries(volk ${volk_libraries}) + target_link_libraries(volk ${volk_libraries} m) - - #Configure target properties - set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fPIC") + target_include_directories(volk + PUBLIC ${PROJECT_BINARY_DIR}/include + PUBLIC ${PROJECT_SOURCE_DIR}/include +@@ -584,7 +584,7 @@ + else() + #create the volk runtime library + add_library(volk 
SHARED ${volk_sources}) +- target_link_libraries(volk ${volk_libraries}) ++ target_link_libraries(volk ${volk_libraries} m) + include_directories(volk + PUBLIC ${PROJECT_BINARY_DIR}/include + PUBLIC ${PROJECT_SOURCE_DIR}/include diff -Nru volk-1.3/debian/patches/make-acc-happy volk-1.4/debian/patches/make-acc-happy --- volk-1.3/debian/patches/make-acc-happy 2016-07-02 22:51:23.000000000 +0000 +++ volk-1.4/debian/patches/make-acc-happy 2018-03-28 02:11:35.000000000 +0000 @@ -1,21 +1,20 @@ -From: A. Maitland Bottoms -Subject: make acc happy +From 799245ea6e9e05cc0ed0fabe783fbbe1a5054fd4 Mon Sep 17 00:00:00 2001 +From: "A. Maitland Bottoms" +Date: Tue, 27 Mar 2018 22:02:59 -0400 +Subject: [PATCH 2/6] make acc happy The abi-compliance-checker grabs all the .h files it finds and tries to compile them all. Even though some are not appropriate for the architecture being run on. Being careful -with preprocessor protections avoids probplems. +with preprocessor protections avoids problems. +--- + include/volk/volk_neon_intrinsics.h | 2 ++ + kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h | 1 + + kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 3 --- + 3 files changed, 3 insertions(+), 3 deletions(-) ---- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h -+++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - - - static inline void +diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h +index 3f121a8..ef42dea 100644 --- a/include/volk/volk_neon_intrinsics.h +++ b/include/volk/volk_neon_intrinsics.h @@ -27,6 +27,7 @@ @@ -26,15 +25,29 @@ #include static inline float32x4_t -@@ -119,4 +120,5 @@ +@@ -119,4 +120,5 @@ _vlog2q_f32(float32x4_t aval) return log2_approx; } +#endif /*LV_HAVE_NEON*/ #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */ +diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +index 
49a3076..4f84352 100644 +--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + + static inline void +diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +index 6a605c6..ff729a8 100644 --- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h -@@ -58,8 +58,6 @@ +@@ -58,8 +58,6 @@ encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr } } @@ -43,7 +56,7 @@ static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp, unsigned int frame_size) -@@ -79,7 +77,6 @@ +@@ -79,7 +77,6 @@ volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp --stage; } } @@ -51,3 +64,6 @@ #ifdef LV_HAVE_SSSE3 #include +-- +2.11.0 + diff -Nru volk-1.3/debian/patches/native-armv7-build-support volk-1.4/debian/patches/native-armv7-build-support --- volk-1.3/debian/patches/native-armv7-build-support 2016-07-02 22:56:58.000000000 +0000 +++ volk-1.4/debian/patches/native-armv7-build-support 2018-03-28 03:00:30.000000000 +0000 @@ -1,9 +1,14 @@ -From: A. Maitland Bottoms -Subject: native armv7 build support +From b554121e765a3495e23975112f269a8083950212 Mon Sep 17 00:00:00 2001 +From: "A. Maitland Bottoms" +Date: Tue, 27 Mar 2018 22:01:33 -0400 +Subject: [PATCH 1/6] native armv7 build support Debian, unlike other GNU Radio deployments, does not cross-compile packages, but builds natively on a set of build machines, including both arm and armhf. 
+--- + lib/CMakeLists.txt | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -21,7 +26,7 @@ # implement overruling in the ORC case, # since ORC always passes flag detection ######################################################################## -@@ -414,7 +421,7 @@ +@@ -404,7 +411,7 @@ # Handle ASM support # on by default, but let users turn it off ######################################################################## @@ -30,7 +35,7 @@ set(ASM_ARCHS_AVAILABLE "neon") set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}") -@@ -424,7 +431,7 @@ +@@ -414,7 +421,7 @@ # set up the assembler flags and include the source files foreach(ARCH ${ASM_ARCHS_AVAILABLE}) string(REGEX MATCH "${ARCH}" ASM_ARCH "${available_archs}") @@ -39,7 +44,7 @@ message(STATUS "---- Adding ASM files") # we always use ATT syntax message(STATUS "-- Detected neon architecture; enabling ASM") # setup architecture specific assembler flags -@@ -443,7 +450,7 @@ +@@ -433,7 +440,7 @@ message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}") endforeach(ARCH) @@ -48,7 +53,7 @@ message(STATUS "Not enabling ASM support. 
CMake >= 2.8.10 required.") foreach(machine_name ${available_machines}) string(REGEX MATCH "neon" NEON_MACHINE ${machine_name}) -@@ -451,7 +458,7 @@ +@@ -441,7 +448,7 @@ message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support") endif() endforeach() diff -Nru volk-1.3/debian/patches/series volk-1.4/debian/patches/series --- volk-1.3/debian/patches/series 2018-02-04 18:12:21.000000000 +0000 +++ volk-1.4/debian/patches/series 2018-05-12 19:23:50.000000000 +0000 @@ -1,22 +1,12 @@ -0001-Add-a-AppVeyor-compatible-YAML-file-for-building-on-.patch -0003-apps-fix-profile-update-reading-end-of-lines.patch -0005-qa-lower-tolerance-for-32fc_mag-to-fix-issue-96.patch -0006-Add-NEON-AVX-and-unaligned-versions-of-SSE4.1-and-SS.patch -0007-added-__VOLK_PREFETCH-compatibility-macro.patch -0008-Fix-bug-106-volk_64u_popcnt-bug-in-generic-implement.patch -0009-modtool-deconflict-module-include-guards-from-main-v.patch -0010-modtool-update-the-cmake-find-module-for-volk-mods.patch -0011-Use-powf-to-match-variables-and-avoid-implicit-type-.patch -0012-cmake-support-empty-CMAKE_INSTALL_PREFIX.patch -0013-Support-relocated-install-with-VOLK_PREFIX-env-var.patch -0014-Fixing-a-minimal-bug-in-the-log2-docstring.patch -0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch -0016-kernels-Adds-AVX-support-to-volk_32f_-kernels.patch -0017-kernels-Add-AVX-support-to-32f_x2_divide_32f-32f_x2_.patch -0018-fix-GH-issue-139-for-32fc_index_max_-kernels.patch +0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch +0003-extracted-variables-for-the-source-real-and-imaginar.patch +0004-fixed-a-buffer-over-read-and-over-write-in-volk_32fc.patch +0005-cmake-Fix-endif-to-match-if.patch +0006-Add-sys-time.h-header-copied-from-gnuradio-to-fix-Wi.patch +0007-Fix-compile-on-Windows-by-avoiding-min-max-macros-in.patch +0008-Fix-add_test-for-Windows.patch +0009-Fix-32u_reverse_32u-for-ARM.patch native-armv7-build-support make-acc-happy sort-cmake-glob-lists -install-all-headers 
-sort-input-files.patch libm-link diff -Nru volk-1.3/debian/patches/sort-cmake-glob-lists volk-1.4/debian/patches/sort-cmake-glob-lists --- volk-1.3/debian/patches/sort-cmake-glob-lists 2016-07-02 21:44:05.000000000 +0000 +++ volk-1.4/debian/patches/sort-cmake-glob-lists 2018-03-28 03:14:41.000000000 +0000 @@ -1,8 +1,13 @@ -From: A. Maitland Bottoms -Subject sort cmake glob lists +From 9d32c341220aeb5a07011b7ef349f8c606941ee4 Mon Sep 17 00:00:00 2001 +From: "A. Maitland Bottoms" +Date: Tue, 27 Mar 2018 22:04:11 -0400 +Subject: [PATCH 3/6] sort cmake glob lists File lists are generated in a CMakeLists.txt file with file(GLOB ...), which varies -with the readdir() order. Sorting the lists should help make reproducinble builds. +with the readdir() order. Sorting the lists should help make reproducible builds. +--- + lib/CMakeLists.txt | 3 +++ + 1 file changed, 3 insertions(+) --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt diff -Nru volk-1.3/debian/patches/sort-input-files.patch volk-1.4/debian/patches/sort-input-files.patch --- volk-1.3/debian/patches/sort-input-files.patch 2017-08-27 17:58:00.000000000 +0000 +++ volk-1.4/debian/patches/sort-input-files.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -From f6dbb5f8ba840075dde9f0aa1cc48b805ea4d1c5 Mon Sep 17 00:00:00 2001 -From: "Bernhard M. Wiedemann" -Date: Mon, 5 Jun 2017 21:37:38 +0200 -Subject: [PATCH 2/5] sort input files - -when building packages (e.g. for openSUSE Linux) -(random) filesystem order of input files -influences ordering of entries in the output, -thus without the patch, builds (in disposable VMs) would usually differ. - -See https://reproducible-builds.org/ for why this matters. 
---- - gen/volk_kernel_defs.py | 2 +- - python/volk_modtool/volk_modtool_generate.py | 6 +++--- - 2 files changed, 4 insertions(+), 4 deletions(-) - ---- a/gen/volk_kernel_defs.py -+++ b/gen/volk_kernel_defs.py -@@ -202,7 +202,7 @@ - ######################################################################## - __file__ = os.path.abspath(__file__) - srcdir = os.path.dirname(os.path.dirname(__file__)) --kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h")) -+kernel_files = sorted(glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h"))) - kernels = map(kernel_class, kernel_files) - - if __name__ == '__main__': ---- a/python/volk_modtool/volk_modtool_generate.py -+++ b/python/volk_modtool/volk_modtool_generate.py -@@ -58,10 +58,10 @@ - else: - name = self.get_basename(base) - if name == '': -- hdr_files = glob.glob(os.path.join(base, "kernels/volk/*.h")) -+ hdr_files = sorted(glob.glob(os.path.join(base, "kernels/volk/*.h"))) - begins = re.compile("(?<=volk_).*") - else: -- hdr_files = glob.glob(os.path.join(base, "kernels/volk_" + name + "/*.h")) -+ hdr_files = sorted(glob.glob(os.path.join(base, "kernels/volk_" + name + "/*.h"))) - begins = re.compile("(?<=volk_" + name + "_).*") - - datatypes = [] -@@ -156,7 +156,7 @@ - open(dest, 'w+').write(outstring) - - # copy orc proto-kernels if they exist -- for orcfile in glob.glob(inpath + '/kernels/volk/asm/orc/' + top + name + '*.orc'): -+ for orcfile in sorted(glob.glob(inpath + '/kernels/volk/asm/orc/' + top + name + '*.orc')): - if os.path.isfile(orcfile): - instring = open(orcfile, 'r').read() - outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring) diff -Nru volk-1.3/debian/release-v14.html volk-1.4/debian/release-v14.html --- volk-1.3/debian/release-v14.html 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/debian/release-v14.html 2018-03-27 00:41:55.000000000 +0000 @@ -0,0 +1,201 @@ + + + + + Vector Optimized Library of Kernels + + + + + + + +

+ +

+ + Vector-Optimized Library of Kernels + : +Release v1.4 +

Information

Source Code

Community

+ +

VOLK v1.4

A lot of really good changes came to VOLK with v1.4. It wouldn't have been possible without the following contributors:

Contributors

Andrej Rode mail@andrejro.de
Bernhard M. Wiedemann bwiedemann@suse.de
Carles Fernandez carles.fernandez@gmail.com
Christoph Mayer Christoph.Mayer@cern.ch
Damian Miralles damian.miralles@colorado.edu
Douglas Anderson douglas.j.anderson@gmail.com
hcab14 hcab14@gmail.com
Johannes Demel ufcsy@student.kit.edu
Josh Blum josh@joshknows.com
luz.paz luzpaz@users.noreply.github.com
Magnus Lundmark magnus@skysense.io
Marcus Müller marcus@hostalia.de
Michael Dickens michael.dickens@ettus.com
Nathan West nwest@deepsig.io
Nick Foster bistromath@gmail.com
Paul Cercueil paul.cercueil@analog.com
Stefan Wunsch stefan.wunsch@student.kit.edu

Changes

Generally, there are a lot of kernel changes and some minor dependency changes. I'm trying to remove boost as a dependency and we've introduced mako templates rather than the old Cheetah-templates to keep in line with GNU Radio. There are also several new CI files that support appveyor, travis-ci, and gitlab. Right now all pull requests must pass travis-ci.

Kernels

The easiest way to show these changes is simply with two lists:

New kernels

32 bit reversal
32f_s32f_s32f_mod_range_32f
double precision (64f_XXX...)
- multiply
- add
+
32f_64f_multiply_64f
add 32f_64f_add_64f
32fc_x2_add_32fc

New proto-kernels by architecture

AVX(2):

Note that in some cases an unaligned version was added where an aligned version already existed

volk_64f_convert_32f
volk_64f_x2_max_64f
volk_64f_x2_min_64f
volk_32f_x2_add_32f
32i_x2_and_32i
32i_x2_or_32i
conjugate dot products
32f_accumulator_32f
stddev_and_mean
volk_32f_* kernels
32f_x2_divide_32f
32f_x2_dot_prod_16i
volk_32f_s32f_normalize
volk_32f_s32f_stddev_32f
volk_32f_sqrt_32f
volk_32f_x2_max_32f
volk_32f_x2_min_32f
32f_x2_s32f_interleave_16ic
32f_x2_subtract_32f
32f_x2_s32f_interleave_16ic
32f_x2_subtract_32f
32f_x2_subtract_32f
32f_x2_s32f_interleave_16ic
volk_8ic_s32f_deinterleave_*
32f_log2_32f
volk_32f_s32f_convert_8i and 16i

NEON:

move all neonasm to aligned protokernels
added ARM version of volk_32u_reverse_32u (RBIT)
volk_32fc_x2_divide_32fc
volk_32fc_32f_add_32fc
volk_32f_x2_divide_32f
volk_8i_s32f_convert_32f

Additionally, there are new protokernel intrinsics available for use in writing new kernels.

Then, we also had some general kernel and protokernel bug fixes, and we now use properly type-named C functions, which happened to increase performance:

The polarbutterfly went through some heavy refactoring and bug fixes as well as adding an AVX version. +Fix GH issue #139 for 32fc_index_max_* kernels resulting in a slightly wrong index being returned. +Fix bug 106 (volk_64u_popcnt bug in generic implementation)

CI and Builds

As previously mentioned there are appveyor, travis-ci, and gitlab CI files available. There is a travis-ci instance checking all pull requests at https://travis-ci.org/gnuradio/volk/ and a gitlab mirror running CI checks at https://gitlab.com/n-west/volk.

While working on these CI files, the kernel tests were split into individual ctest targets so that each kernel is its own test rather than running them as a monolithic binary. This allows parallel testing, but mostly enables easier diagnostics when a test fails. The readme is now a markdown file that renders well on GitHub and Gitlab along with the travis-ci status as a badge.

Within this release two tools were run that reorganized includes and fixed a bunch of typos within code.

As part of the attempt to build VOLK without boost, a bunch of app and build utilities were written to replace boost-code. This shouldn't be visible to the user, but will hopefully make future builds easier and smaller with fewer build and run-time dependencies. Builds with python 2.7 and 3 should work, although six is required for python2.7 support.

Some build changes make it easier to do a relocatable build and order all files before building so that building from a particular revision (from now on) should be reproducible across machines building the same architectures. To use a relocatable install use the VOLK_PREFIX environment variable. This should support snaps (Canonical packaging environment).

Modtool

modtool: update the cmake find module for volk mods
+modtool: deconflict module include guards from main volk
+

+ +

+ + + + + + + + + + + + + \ No newline at end of file diff -Nru volk-1.3/docs/terms_and_techniques.dox volk-1.4/docs/terms_and_techniques.dox --- volk-1.3/docs/terms_and_techniques.dox 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/docs/terms_and_techniques.dox 2018-03-26 22:52:55.000000000 +0000 @@ -42,7 +42,7 @@ VOLK's QA tool as well as the volk profiler. Many kernels are able to share test parameters, but new kernels might need new ones. -If the VOLK kernel does not 'fit' the the standard set of function parameters +If the VOLK kernel does not 'fit' the standard set of function parameters expected by the volk_profile structure, you need to create a VOLK puppet function to help the profiler call the kernel. This is essentially due to the function run_volk_tests which has a limited number of function prototypes that diff -Nru volk-1.3/Doxyfile.in volk-1.4/Doxyfile.in --- volk-1.3/Doxyfile.in 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/Doxyfile.in 2018-03-26 22:52:55.000000000 +0000 @@ -1117,7 +1117,7 @@ # defined cascading style sheet that is included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefor more robust against future updates. +# standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet file to the output directory. For an example # see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -2008,7 +2008,7 @@ EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will -# remove all refrences to function-like macros that are alone on a line, have an +# remove all references to function-like macros that are alone on a line, have an # all uppercase name, and do not end with a semicolon. 
Such function macros are # typically used for boiler-plate code, and will confuse the parser if not # removed. diff -Nru volk-1.3/gen/volk_arch_defs.py volk-1.4/gen/volk_arch_defs.py --- volk-1.3/gen/volk_arch_defs.py 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/gen/volk_arch_defs.py 2018-03-26 22:52:55.000000000 +0000 @@ -15,6 +15,10 @@ # along with this program. If not, see . # +from __future__ import print_function + +import six + archs = list() arch_dict = dict() @@ -75,11 +79,11 @@ flags = dict() for flag_xml in arch_xml.getElementsByTagName("flag"): name = flag_xml.attributes["compiler"].value - if not flags.has_key(name): flags[name] = list() + if name not in flags: flags[name] = list() flags[name].append(flag_xml.firstChild.data) #force kwargs keys to be of type str, not unicode for py25 - kwargs = dict((str(k), v) for k, v in kwargs.iteritems()) + kwargs = dict((str(k), v) for k, v in six.iteritems(kwargs)) register_arch(flags=flags, checks=checks, **kwargs) if __name__ == '__main__': - print archs + print(archs) diff -Nru volk-1.3/gen/volk_compile_utils.py volk-1.4/gen/volk_compile_utils.py --- volk-1.3/gen/volk_compile_utils.py 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/gen/volk_compile_utils.py 2018-03-26 22:52:55.000000000 +0000 @@ -16,6 +16,8 @@ # along with this program. If not, see . 
# +from __future__ import print_function + import optparse import volk_arch_defs import volk_machine_defs @@ -26,7 +28,7 @@ if not arch.is_supported(compiler): continue fields = [arch.name] + arch.get_flags(compiler) output.append(','.join(fields)) - print ';'.join(output) + print(';'.join(output)) def do_machines_list(arch_names): output = list() @@ -34,14 +36,14 @@ machine_arch_set = set(machine.arch_names) if set(arch_names).intersection(machine_arch_set) == machine_arch_set: output.append(machine.name) - print ';'.join(output) + print(';'.join(output)) def do_machine_flags_list(compiler, machine_name): output = list() machine = volk_machine_defs.machine_dict[machine_name] for arch in machine.archs: output.extend(arch.get_flags(compiler)) - print ' '.join(output) + print(' '.join(output)) def main(): parser = optparse.OptionParser() diff -Nru volk-1.3/gen/volk_kernel_defs.py volk-1.4/gen/volk_kernel_defs.py --- volk-1.3/gen/volk_kernel_defs.py 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/gen/volk_kernel_defs.py 2018-03-26 22:52:55.000000000 +0000 @@ -19,6 +19,8 @@ # Boston, MA 02110-1301, USA. 
# +from __future__ import print_function + import os import re import sys @@ -98,9 +100,9 @@ def print_sections(sections, indent = ' '): for header, body in sections: if header == 'text': - print indent, ('\n'+indent).join(body.splitlines()) + print(indent, ('\n'+indent).join(body.splitlines())) continue - print indent.replace(' ', '-') + '>', header + print(indent.replace(' ', '-') + '>', header) print_sections(body, indent + ' ') ######################################################################## @@ -136,7 +138,7 @@ arg_type, arg_name = m.groups() self.args.append((arg_type, arg_name)) except Exception as ex: - raise Exception, 'I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex) + raise Exception('I can\'t parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex)) assert self.name self.is_aligned = self.name.startswith('a_') @@ -202,8 +204,8 @@ ######################################################################## __file__ = os.path.abspath(__file__) srcdir = os.path.dirname(os.path.dirname(__file__)) -kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h")) -kernels = map(kernel_class, kernel_files) +kernel_files = sorted(glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h"))) +kernels = list(map(kernel_class, kernel_files)) if __name__ == '__main__': - print kernels + print(kernels) diff -Nru volk-1.3/gen/volk_machine_defs.py volk-1.4/gen/volk_machine_defs.py --- volk-1.3/gen/volk_machine_defs.py 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/gen/volk_machine_defs.py 2018-03-26 22:52:55.000000000 +0000 @@ -15,6 +15,10 @@ # along with this program. If not, see . 
# +from __future__ import print_function + +import six + from volk_arch_defs import arch_dict machines = list() @@ -67,8 +71,8 @@ except: pass kwargs['archs'] = kwargs['archs'].split() #force kwargs keys to be of type str, not unicode for py25 - kwargs = dict((str(k), v) for k, v in kwargs.iteritems()) + kwargs = dict((str(k), v) for k, v in six.iteritems(kwargs)) register_machine(**kwargs) if __name__ == '__main__': - print machines + print(machines) diff -Nru volk-1.3/gen/volk_tmpl_utils.py volk-1.4/gen/volk_tmpl_utils.py --- volk-1.3/gen/volk_tmpl_utils.py 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/gen/volk_tmpl_utils.py 2018-03-26 22:52:55.000000000 +0000 @@ -20,6 +20,8 @@ # Boston, MA 02110-1301, USA. # +from __future__ import print_function + import os import re import sys @@ -27,22 +29,7 @@ import volk_arch_defs import volk_machine_defs import volk_kernel_defs -from Cheetah import Template - -def __escape_pre_processor(code): - out = list() - for line in code.splitlines(): - m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line) - if m: - p0, p1, fcn, stuff = m.groups() - conly = fcn in ('include', 'define', 'ifdef', 'ifndef', 'endif', 'elif', 'pragma') - both = fcn in ('if', 'else') - istmpl = '$' in stuff - if 'defined' in stuff: istmpl = False - if conly or (both and not istmpl): - line = '%s\\#%s%s%s'%(p0, p1, fcn, stuff) - out.append(line) - return '\n'.join(out) +from mako.template import Template def __parse_tmpl(_tmpl, **kwargs): defs = { @@ -53,13 +40,12 @@ 'kernels': volk_kernel_defs.kernels, } defs.update(kwargs) - _tmpl = __escape_pre_processor(_tmpl) _tmpl = """ /* this file was generated by volk template utils, do not edit! 
*/ """ + _tmpl - return str(Template.Template(_tmpl, defs)) + return str(Template(_tmpl).render(**defs)) def main(): parser = optparse.OptionParser() @@ -69,6 +55,6 @@ output = __parse_tmpl(open(opts.input).read(), args=args) if opts.output: open(opts.output, 'w').write(output) - else: print output + else: print(output) if __name__ == '__main__': main() diff -Nru volk-1.3/.gitignore volk-1.4/.gitignore --- volk-1.3/.gitignore 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/.gitignore 2018-03-26 22:52:55.000000000 +0000 @@ -1,4 +1,4 @@ *~ *.pyc *.pyo -build/ +*build*/ diff -Nru volk-1.3/.gitlab-ci.yml volk-1.4/.gitlab-ci.yml --- volk-1.3/.gitlab-ci.yml 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/.gitlab-ci.yml 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,30 @@ +# This file is a template, and might need editing before it works on your project. +# use the official gcc image, based on debian +# can use verions as well, like gcc:5.2 +# see https://hub.docker.com/_/gcc/ +image: ubuntu:16.04 + +build: + stage: build + # instead of calling g++ directly you can also use some build toolkit like make + # install the necessary build tools when needed + before_script: + - apt update && apt -y install make cmake python python-pip libboost-all-dev && pip install six mako + script: + - mkdir build && cd build && cmake .. 
&& make -j + artifacts: + paths: + - build/ + # depending on your build setup it's most likely a good idea to cache outputs to reduce the build time + # cache: + # paths: + # - "*.o" + +# run tests using the binary built before +test: + stage: test + before_script: + - apt update && apt -y install cmake python python-pip libboost-all-dev && pip install six mako + script: + - cd build && ctest -V + diff -Nru volk-1.3/include/volk/constants.h volk-1.4/include/volk/constants.h --- volk-1.3/include/volk/constants.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/include/volk/constants.h 2018-03-26 22:52:55.000000000 +0000 @@ -27,11 +27,11 @@ __VOLK_DECL_BEGIN -VOLK_API char* volk_prefix(); -VOLK_API char* volk_version(); -VOLK_API char* volk_c_compiler(); -VOLK_API char* volk_compiler_flags(); -VOLK_API char* volk_available_machines(); +VOLK_API const char* volk_prefix(); +VOLK_API const char* volk_version(); +VOLK_API const char* volk_c_compiler(); +VOLK_API const char* volk_compiler_flags(); +VOLK_API const char* volk_available_machines(); __VOLK_DECL_END diff -Nru volk-1.3/include/volk/volk_avx_intrinsics.h volk-1.4/include/volk/volk_avx_intrinsics.h --- volk-1.3/include/volk/volk_avx_intrinsics.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/include/volk/volk_avx_intrinsics.h 2018-03-26 22:52:55.000000000 +0000 @@ -68,4 +68,62 @@ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); } +static inline __m256 +_mm256_polar_sign_mask(__m128i fbits){ + __m256 sign_mask_dummy = _mm256_setzero_ps(); + const __m128i zeros = _mm_set1_epi8(0x00); + const __m128i sign_extract = _mm_set1_epi8(0x80); + const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03); + const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); + + fbits = _mm_cmpgt_epi8(fbits, zeros); + fbits = _mm_and_si128(fbits, 
sign_extract); + __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); + __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); + + __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); + return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); +// // This is the desired function call. Though it seems to be missing in GCC. +// // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# +// return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0)); +} + +static inline void +_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ + // deinterleave values + __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20); + __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31); + *llr0 = _mm256_shuffle_ps(part0, part1, 0x88); + *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); +} + +static inline __m256 +_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){ + const __m256 sign_mask = _mm256_set1_ps(-0.0f); + const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); + + __m256 llr0, llr1; + _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); + + // calculate result + __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); + __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); + return _mm256_or_ps(dst, sign); +} + +static inline __m256 +_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){ + // prepare sign mask for correct +- + __m256 sign_mask = _mm256_polar_sign_mask(fbits); + + __m256 llr0, llr1; + _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); + + // calculate result + llr0 = _mm256_xor_ps(llr0, sign_mask); + __m256 dst = _mm256_add_ps(llr0, llr1); + return dst; +} + + #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */ diff -Nru volk-1.3/include/volk/volk_common.h volk-1.4/include/volk/volk_common.h --- 
volk-1.3/include/volk/volk_common.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/include/volk/volk_common.h 2018-03-26 22:52:55.000000000 +0000 @@ -9,6 +9,8 @@ # define __VOLK_ATTR_UNUSED __attribute__((unused)) # define __VOLK_ATTR_INLINE __attribute__((always_inline)) # define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +# define __VOLK_ASM __asm__ +# define __VOLK_VOLATILE __volatile__ # if __GNUC__ >= 4 # define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) # define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) @@ -16,6 +18,7 @@ # define __VOLK_ATTR_EXPORT # define __VOLK_ATTR_IMPORT # endif +# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) #elif _MSC_VER # define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) # define __VOLK_ATTR_UNUSED @@ -23,6 +26,9 @@ # define __VOLK_ATTR_DEPRECATED __declspec(deprecated) # define __VOLK_ATTR_EXPORT __declspec(dllexport) # define __VOLK_ATTR_IMPORT __declspec(dllimport) +# define __VOLK_PREFETCH(addr) +# define __VOLK_ASM __asm +# define __VOLK_VOLATILE #else # define __VOLK_ATTR_ALIGNED(x) # define __VOLK_ATTR_UNUSED @@ -30,6 +36,9 @@ # define __VOLK_ATTR_DEPRECATED # define __VOLK_ATTR_EXPORT # define __VOLK_ATTR_IMPORT +# define __VOLK_PREFETCH(addr) +# define __VOLK_ASM __asm__ +# define __VOLK_VOLATILE __volatile__ #endif //////////////////////////////////////////////////////////////////////// diff -Nru volk-1.3/include/volk/volk_complex.h volk-1.4/include/volk/volk_complex.h --- volk-1.3/include/volk/volk_complex.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/include/volk/volk_complex.h 2018-03-26 22:52:55.000000000 +0000 @@ -47,6 +47,11 @@ #else /* __cplusplus */ +#if __STDC_VERSION__ >= 199901L /* C99 check */ +/* this allows us to conj in lv_conj without the double detour for single-precision floats */ +#include +#endif /* C99 check */ + #include typedef char complex lv_8sc_t; @@ -70,7 +75,9 @@ #define lv_conj(x) (~(x)) // When not available, use the c99 complex function 
family, -// which always returns double regardless of the input type. +// which always returns double regardless of the input type, +// unless we have C99 and thus tgmath.h overriding functions +// with type-generic versions. #else /* __GNUC__ */ #define lv_creal(x) (creal(x)) diff -Nru volk-1.3/include/volk/volk_malloc.h volk-1.4/include/volk/volk_malloc.h --- volk-1.3/include/volk/volk_malloc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/include/volk/volk_malloc.h 2018-03-26 22:52:55.000000000 +0000 @@ -36,7 +36,7 @@ * memory that are guaranteed to be on an alignment, VOLK handles this * itself. The volk_malloc function behaves like malloc in that it * returns a pointer to the allocated memory. However, it also takes - * in an alignment specfication, which is usually something like 16 or + * in an alignment specification, which is usually something like 16 or * 32 to ensure that the aligned memory is located on a particular * byte boundary for use with SIMD. * diff -Nru volk-1.3/kernels/README.txt volk-1.4/kernels/README.txt --- volk-1.3/kernels/README.txt 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/README.txt 2018-03-26 22:52:55.000000000 +0000 @@ -3,7 +3,7 @@ ######################################################################## A kernel dispatcher is kernel implementation that calls other kernel implementations. By default, a dispatcher is generated by the build system for every kernel such that: - * the best aligned implemention is called when all pointer arguments are aligned, + * the best aligned implementation is called when all pointer arguments are aligned, * and otherwise the best unaligned implementation is called. 
The author of a VOLK kernel may create a custom dispatcher, diff -Nru volk-1.3/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s volk-1.4/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s --- volk-1.3/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s 2018-03-26 22:52:55.000000000 +0000 @@ -1,11 +1,11 @@ -@ static inline void volk_16i_max_star_horizontal_16i_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_16i_max_star_horizontal_16i_neonasm -volk_16i_max_star_horizontal_16i_neonasm: +@ static inline void volk_16i_max_star_horizontal_16i_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_16i_max_star_horizontal_16i_a_neonasm +volk_16i_max_star_horizontal_16i_a_neonasm: @ r0 - cVector: pointer to output array @ r1 - aVector: pointer to input array 1 @ r2 - num_points: number of items to process -volk_16i_max_star_horizontal_16i_neonasm: +volk_16i_max_star_horizontal_16i_a_neonasm: pld [r1:128] push {r4, r5, r6} @ preserve register states lsrs r5, r2, #4 @ 1/16th points = num_points/16 @@ -28,7 +28,7 @@ vadd.i16 q10, q11, q10 @ add results to get max vst1.16 {d20-d21}, [r12]! 
@ store the results bne .loop1 @ at least 16 items left - add r1, r1, r3, lsl #5 + add r1, r1, r3, lsl #5 add r0, r0, r3, lsl #4 .smallvector: ands r2, r2, #15 diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_unrollasm.s volk-1.4/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_unrollasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_unrollasm.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_unrollasm.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,146 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_unrollasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) +.global volk_32fc_32f_dot_prod_32fc_a_unrollasm +volk_32fc_32f_dot_prod_32fc_a_unrollasm: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + push {r4, r5, r6, r7, r8, r9} + vpush {q4-q7} + sub r13, r13, #56 @ 0x38 + add r12, r13, #8 + lsrs r8, r3, #3 + veor.32 q2, q5, q5 + veor.32 q3, q5, q5 + veor.32 q4, q5, q5 + veor.32 q5, q5, q5 + beq .smallvector + vld2.32 {d20-d23}, [r1]! + vld1.32 {d24-d25}, [r2]! + mov r5, #1 + + + +.mainloop: + vld2.32 {d14-d17}, [r1]! @ q7,q8 + vld1.32 {d18-d19}, [r2]! @ q9 + + vmul.f32 q0, q12, q10 @ real mult + vmul.f32 q1, q12, q11 @ imag mult + + add r5, r5, #1 + cmp r5, r8 + + vadd.f32 q4, q4, q0@ q4 accumulates real + vadd.f32 q5, q5, q1@ q5 accumulates imag + + vld2.32 {d20-d23}, [r1]! @ q10-q11 + vld1.32 {d24-d25}, [r2]! 
@ q12 + + vmul.f32 q13, q9, q7 + vmul.f32 q14, q9, q8 + vadd.f32 q2, q2, q13 @ q2 accumulates real + vadd.f32 q3, q3, q14 @ q3 accumulates imag + + + + bne .mainloop + + vmul.f32 q0, q12, q10 @ real mult + vmul.f32 q1, q12, q11 @ imag mult + + vadd.f32 q4, q4, q0@ q4 accumulates real + vadd.f32 q5, q5, q1@ q5 accumulates imag + + +.smallvector: + vadd.f32 q0, q2, q4 + add r12, r13, #24 + lsl r8, r8, #3 + vadd.f32 q1, q3, q5 + cmp r3, r8 + + vadd.f32 d0, d0, d1 + vadd.f32 d1, d2, d3 + vadd.f32 s14, s0, s1 + vadd.f32 s15, s2, s3 + + vstr s14, [r13] + vstr s15, [r13, #4] + bls .D1 + rsb r12, r8, r3 + lsr r4, r12, #2 + cmp r4, #0 + cmpne r12, #3 + lsl r5, r4, #2 + movhi r6, #0 + movls r6, #1 + bls .L1 + vmov.i32 q10, #0 @ 0x00000000 + mov r9, r1 + mov r7, r2 + vorr q11, q10, q10 + +.smallloop: + add r6, r6, #1 + vld2.32 {d16-d19}, [r9]! + cmp r4, r6 + vld1.32 {d24-d25}, [r7]! + vmla.f32 q11, q12, q8 + vmla.f32 q10, q12, q9 + bhi .smallloop + vmov.i32 q9, #0 @ 0x00000000 + cmp r12, r5 + vadd.f32 d20, d20, d21 + add r8, r8, r5 + vorr q8, q9, q9 + add r1, r1, r5, lsl #3 + vadd.f32 d22, d22, d23 + add r2, r2, r5, lsl #2 + vpadd.f32 d18, d20, d20 + vpadd.f32 d16, d22, d22 + vmov.32 r4, d18[0] + vmov.32 r12, d16[0] + vmov s13, r4 + vadd.f32 s15, s13, s15 + vmov s13, r12 + vadd.f32 s14, s13, s14 + beq .finishreduction + .L1: + add r12, r8, #1 + vldr s13, [r2] + cmp r3, r12 + vldr s11, [r1] + vldr s12, [r1, #4] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + bls .finishreduction + add r8, r8, #2 + vldr s13, [r2, #4] + cmp r3, r8 + vldr s11, [r1, #8] + vldr s12, [r1, #12] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + bls .finishreduction + vldr s13, [r2, #8] + vldr s11, [r1, #16] + vldr s12, [r1, #20] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + +.finishreduction: + vstr s14, [r13] + vstr s15, [r13, #4] + .D1: + ldr r3, [r13] + str r3, [r0] + ldr r3, [r13, #4] + str r3, [r0, #4] + add r13, r13, #56 @ 0x38 + vpop {q4-q7} + pop {r4, r5, r6, r7, r8, r9} + bx r14 + 
+ diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s volk-1.4/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,146 +0,0 @@ -@ static inline void volk_32fc_32f_dot_prod_32fc_unrollasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) -.global volk_32fc_32f_dot_prod_32fc_unrollasm -volk_32fc_32f_dot_prod_32fc_unrollasm: - @ r0 - result: pointer to output array (32fc) - @ r1 - input: pointer to input array 1 (32fc) - @ r2 - taps: pointer to input array 2 (32f) - @ r3 - num_points: number of items to process - - push {r4, r5, r6, r7, r8, r9} - vpush {q4-q7} - sub r13, r13, #56 @ 0x38 - add r12, r13, #8 - lsrs r8, r3, #3 - veor.32 q2, q5, q5 - veor.32 q3, q5, q5 - veor.32 q4, q5, q5 - veor.32 q5, q5, q5 - beq .smallvector - vld2.32 {d20-d23}, [r1]! - vld1.32 {d24-d25}, [r2]! - mov r5, #1 - - - -.mainloop: - vld2.32 {d14-d17}, [r1]! @ q7,q8 - vld1.32 {d18-d19}, [r2]! @ q9 - - vmul.f32 q0, q12, q10 @ real mult - vmul.f32 q1, q12, q11 @ imag mult - - add r5, r5, #1 - cmp r5, r8 - - vadd.f32 q4, q4, q0@ q4 accumulates real - vadd.f32 q5, q5, q1@ q5 accumulates imag - - vld2.32 {d20-d23}, [r1]! @ q10-q11 - vld1.32 {d24-d25}, [r2]! 
@ q12 - - vmul.f32 q13, q9, q7 - vmul.f32 q14, q9, q8 - vadd.f32 q2, q2, q13 @ q2 accumulates real - vadd.f32 q3, q3, q14 @ q3 accumulates imag - - - - bne .mainloop - - vmul.f32 q0, q12, q10 @ real mult - vmul.f32 q1, q12, q11 @ imag mult - - vadd.f32 q4, q4, q0@ q4 accumulates real - vadd.f32 q5, q5, q1@ q5 accumulates imag - - -.smallvector: - vadd.f32 q0, q2, q4 - add r12, r13, #24 - lsl r8, r8, #3 - vadd.f32 q1, q3, q5 - cmp r3, r8 - - vadd.f32 d0, d0, d1 - vadd.f32 d1, d2, d3 - vadd.f32 s14, s0, s1 - vadd.f32 s15, s2, s3 - - vstr s14, [r13] - vstr s15, [r13, #4] - bls .D1 - rsb r12, r8, r3 - lsr r4, r12, #2 - cmp r4, #0 - cmpne r12, #3 - lsl r5, r4, #2 - movhi r6, #0 - movls r6, #1 - bls .L1 - vmov.i32 q10, #0 @ 0x00000000 - mov r9, r1 - mov r7, r2 - vorr q11, q10, q10 - -.smallloop: - add r6, r6, #1 - vld2.32 {d16-d19}, [r9]! - cmp r4, r6 - vld1.32 {d24-d25}, [r7]! - vmla.f32 q11, q12, q8 - vmla.f32 q10, q12, q9 - bhi .smallloop - vmov.i32 q9, #0 @ 0x00000000 - cmp r12, r5 - vadd.f32 d20, d20, d21 - add r8, r8, r5 - vorr q8, q9, q9 - add r1, r1, r5, lsl #3 - vadd.f32 d22, d22, d23 - add r2, r2, r5, lsl #2 - vpadd.f32 d18, d20, d20 - vpadd.f32 d16, d22, d22 - vmov.32 r4, d18[0] - vmov.32 r12, d16[0] - vmov s13, r4 - vadd.f32 s15, s13, s15 - vmov s13, r12 - vadd.f32 s14, s13, s14 - beq .finishreduction - .L1: - add r12, r8, #1 - vldr s13, [r2] - cmp r3, r12 - vldr s11, [r1] - vldr s12, [r1, #4] - vmla.f32 s14, s13, s11 - vmla.f32 s15, s13, s12 - bls .finishreduction - add r8, r8, #2 - vldr s13, [r2, #4] - cmp r3, r8 - vldr s11, [r1, #8] - vldr s12, [r1, #12] - vmla.f32 s14, s13, s11 - vmla.f32 s15, s13, s12 - bls .finishreduction - vldr s13, [r2, #8] - vldr s11, [r1, #16] - vldr s12, [r1, #20] - vmla.f32 s14, s13, s11 - vmla.f32 s15, s13, s12 - -.finishreduction: - vstr s14, [r13] - vstr s15, [r13, #4] - .D1: - ldr r3, [r13] - str r3, [r0] - ldr r3, [r13, #4] - str r3, [r0, #4] - add r13, r13, #56 @ 0x38 - vpop {q4-q7} - pop {r4, r5, r6, r7, r8, r9} - bx r14 - 
- diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,96 @@ +@ static inline void volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@ +.global volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests +volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests: + push {r4, r5, r6, r7, r8, r9, sl, fp, lr} + vpush {d8-d15} + lsrs fp, r3, #3 + sub sp, sp, #52 @ 0x34 + mov r9, r3 + mov sl, r0 + mov r7, r1 + mov r8, r2 + vorr q0, q7, q7 + vorr q1, q7, q7 + vorr q2, q7, q7 + vorr q3, q7, q7 + vorr q4, q7, q7 + vorr q5, q7, q7 + veor q6, q7, q7 + vorr q7, q7, q7 + beq .smallvector + mov r4, r1 + mov ip, r2 + mov r3, #0 +.mainloop: + @mov r6, ip + @mov r5, r4 + vld4.32 {d24,d26,d28,d30}, [r6]! + @add ip, ip, #64 @ 0x40 + @add r4, r4, #64 @ 0x40 + vld4.32 {d16,d18,d20,d22}, [r5]! + add r3, r3, #1 + vld4.32 {d25,d27,d29,d31}, [r6]! + vld4.32 {d17,d19,d21,d23}, [r5]! 
+ vmla.f32 q6, q8, q12 + vmla.f32 q0, q9, q12 + cmp r3, fp + vmls.f32 q5, q13, q9 + vmla.f32 q2, q13, q8 + vmla.f32 q7, q10, q14 + vmla.f32 q1, q11, q14 + vmls.f32 q4, q15, q11 + vmla.f32 q3, q15, q10 + bne .mainloop + lsl r3, fp, #6 + add r8, r8, r3 + add r7, r7, r3 +.smallvector: + vadd.f32 q3, q2, q3 + add r3, sp, #16 + lsl r4, fp, #3 + vadd.f32 q4, q5, q4 + cmp r9, r4 + vadd.f32 q6, q6, q7 + vadd.f32 q1, q0, q1 + vadd.f32 q8, q6, q4 + vadd.f32 q9, q1, q3 + vst2.32 {d16-d19}, [r3 :64] + vldr s15, [sp, #24] + vldr s16, [sp, #16] + vldr s17, [sp, #20] + vadd.f32 s16, s16, s15 + vldr s11, [sp, #28] + vldr s12, [sp, #40] @ 0x28 + vldr s13, [sp, #44] @ 0x2c + vldr s14, [sp, #32] + vldr s15, [sp, #36] @ 0x24 + vadd.f32 s17, s17, s11 + vadd.f32 s16, s16, s12 + vadd.f32 s17, s17, s13 + vadd.f32 s16, s16, s14 + vadd.f32 s17, s17, s15 + vstr s16, [sl] + vstr s17, [sl, #4] + bls .epilog + add r5, sp, #8 +.tailcase: + ldr r3, [r7], #8 + mov r0, r5 + ldr r1, [r8], #8 + add r4, r4, #1 + ldr ip, [r7, #-4] + ldr r2, [r8, #-4] + str ip, [sp] + bl __mulsc3 + vldr s14, [sp, #8] + vldr s15, [sp, #12] + vadd.f32 s16, s16, s14 + cmp r4, r9 + vadd.f32 s17, s17, s15 + vstr s16, [sl] + vstr s17, [sl, #4] + bne .tailcase +.epilog: + add sp, sp, #52 @ 0x34 + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, sl, fp, pc} diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,98 @@ +@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32fc_x2_dot_prod_32fc_neonasm +volk_32fc_x2_dot_prod_32fc_neonasm: + push {r4, r5, r6, r7, r8, lr} + vpush {q0-q7} + vpush {q8-q15} + mov r8, 
r3 @ hold on to num_points (r8) + @ zero out accumulators -- leave 1 reg in alu + veor q8, q15, q15 + mov r7, r0 @ (r7) is cVec + veor q9, q15, q15 + mov r5, r1 @ (r5) is aVec + veor q10, q15, q15 + mov r6, r2 @ (r6) is bVec + veor q11, q15, q15 + lsrs r3, r3, #3 @ eighth_points (r3) = num_points/8 + veor q12, q15, q15 + mov r12, r2 @ (r12) is bVec + veor q13, q15, q15 + mov r4, r1 @ (r4) is aVec + veor q14, q15, q15 + veor q15, q15, q15 + beq .smallvector @ nathan optimized this file based on an objdump + @ but I don't understand this jump. Seems like it should go to loop2 + @ and smallvector (really vector reduction) shouldn't need to be a label + mov r2, #0 @ 0 out r2 (now number) +.loop1: + add r2, r2, #1 @ increment number + vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3 + cmp r2, r3 @ is number < eighth_points + @pld [r12, #64] + vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7 + @pld [r4, #64] + vmla.f32 q12, q4, q0 @ real (re*re) + vmla.f32 q14, q4, q1 @ imag (re*im) + vmls.f32 q15, q5, q1 @ real (im*im) + vmla.f32 q13, q5, q0 @ imag (im*re) + + vmla.f32 q8, q2, q6 @ real (re*re) + vmla.f32 q9, q2, q7 @ imag (re*im) + vmls.f32 q10, q3, q7 @ real (im*im) + vmla.f32 q11, q3, q6 @ imag (im*re) + bne .loop1 + lsl r2, r3, #3 @ r2 = eighth_points * 8 + add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!? + add r5, r5, r2 @ aVec = aVec + eighth_points + @ q12-q13 were original real accumulators + @ q14-q15 were original imag accumulators + @ reduce 8 accumulators down to 2 (1 real, 1 imag) + vadd.f32 q8, q10, q8 @ real + real + vadd.f32 q11, q11, q9 @ imag + imag + vadd.f32 q12, q12, q15 @ real + real + vadd.f32 q14, q14, q13 @ imag + imag + vadd.f32 q8, q8, q12 + vadd.f32 q9, q9, q14 +.smallvector: + lsl r4, r3, #3 + cmp r8, r4 + vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary! 
+ vldr s15, [sp, #8] + vldr s17, [sp] + vldr s16, [sp, #4] + vadd.f32 s17, s17, s15 + vldr s11, [sp, #12] + vldr s12, [sp, #24] + vldr s13, [sp, #28] + vldr s14, [sp, #16] + vldr s15, [sp, #20] + vadd.f32 s16, s16, s11 + vadd.f32 s17, s17, s12 + vadd.f32 s16, s16, s13 + vadd.f32 s17, s17, s14 + vadd.f32 s16, s16, s15 + vstr s17, [r7] + vstr s16, [r7, #4] + bls .done +.loop2: + mov r3, r6 + add r6, r6, #8 + vldr s0, [r3] + vldr s1, [r6, #-4] + mov r3, r5 + add r5, r5, #8 + vldr s2, [r3] + vldr s3, [r5, #-4] + bl __mulsc3 @ GCC/Clang built-in. Portability? + add r4, r4, #1 + cmp r4, r8 + vadd.f32 s17, s17, s0 + vadd.f32 s16, s16, s1 + vstr s17, [r7] + vstr s16, [r7, #4] + bne .loop2 +.done: + vpop {q8-q15} + vpop {q0-q7} + pop {r4, r5, r6, r7, r8, pc} + diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ -@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@ -.global volk_32fc_x2_dot_prod_32fc_neonasm_opttests -volk_32fc_x2_dot_prod_32fc_neonasm_opttests: - push {r4, r5, r6, r7, r8, r9, sl, fp, lr} - vpush {d8-d15} - lsrs fp, r3, #3 - sub sp, sp, #52 @ 0x34 - mov r9, r3 - mov sl, r0 - mov r7, r1 - mov r8, r2 - vorr q0, q7, q7 - vorr q1, q7, q7 - vorr q2, q7, q7 - vorr q3, q7, q7 - vorr q4, q7, q7 - vorr q5, q7, q7 - veor q6, q7, q7 - vorr q7, q7, q7 - beq .smallvector - mov r4, r1 - mov ip, r2 - mov r3, #0 -.mainloop: - @mov r6, ip - @mov r5, r4 - vld4.32 {d24,d26,d28,d30}, [r6]! - @add ip, ip, #64 @ 0x40 - @add r4, r4, #64 @ 0x40 - vld4.32 {d16,d18,d20,d22}, [r5]! - add r3, r3, #1 - vld4.32 {d25,d27,d29,d31}, [r6]! 
- vld4.32 {d17,d19,d21,d23}, [r5]! - vmla.f32 q6, q8, q12 - vmla.f32 q0, q9, q12 - cmp r3, fp - vmls.f32 q5, q13, q9 - vmla.f32 q2, q13, q8 - vmla.f32 q7, q10, q14 - vmla.f32 q1, q11, q14 - vmls.f32 q4, q15, q11 - vmla.f32 q3, q15, q10 - bne .mainloop - lsl r3, fp, #6 - add r8, r8, r3 - add r7, r7, r3 -.smallvector: - vadd.f32 q3, q2, q3 - add r3, sp, #16 - lsl r4, fp, #3 - vadd.f32 q4, q5, q4 - cmp r9, r4 - vadd.f32 q6, q6, q7 - vadd.f32 q1, q0, q1 - vadd.f32 q8, q6, q4 - vadd.f32 q9, q1, q3 - vst2.32 {d16-d19}, [r3 :64] - vldr s15, [sp, #24] - vldr s16, [sp, #16] - vldr s17, [sp, #20] - vadd.f32 s16, s16, s15 - vldr s11, [sp, #28] - vldr s12, [sp, #40] @ 0x28 - vldr s13, [sp, #44] @ 0x2c - vldr s14, [sp, #32] - vldr s15, [sp, #36] @ 0x24 - vadd.f32 s17, s17, s11 - vadd.f32 s16, s16, s12 - vadd.f32 s17, s17, s13 - vadd.f32 s16, s16, s14 - vadd.f32 s17, s17, s15 - vstr s16, [sl] - vstr s17, [sl, #4] - bls .epilog - add r5, sp, #8 -.tailcase: - ldr r3, [r7], #8 - mov r0, r5 - ldr r1, [r8], #8 - add r4, r4, #1 - ldr ip, [r7, #-4] - ldr r2, [r8, #-4] - str ip, [sp] - bl __mulsc3 - vldr s14, [sp, #8] - vldr s15, [sp, #12] - vadd.f32 s16, s16, s14 - cmp r4, r9 - vadd.f32 s17, s17, s15 - vstr s16, [sl] - vstr s17, [sl, #4] - bne .tailcase -.epilog: - add sp, sp, #52 @ 0x34 - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, sl, fp, pc} diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,98 +0,0 @@ -@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_32fc_x2_dot_prod_32fc_neonasm -volk_32fc_x2_dot_prod_32fc_neonasm: - push {r4, r5, r6, r7, r8, lr} - vpush {q0-q7} 
- vpush {q8-q15} - mov r8, r3 @ hold on to num_points (r8) - @ zero out accumulators -- leave 1 reg in alu - veor q8, q15, q15 - mov r7, r0 @ (r7) is cVec - veor q9, q15, q15 - mov r5, r1 @ (r5) is aVec - veor q10, q15, q15 - mov r6, r2 @ (r6) is bVec - veor q11, q15, q15 - lsrs r3, r3, #3 @ eighth_points (r3) = num_points/8 - veor q12, q15, q15 - mov r12, r2 @ (r12) is bVec - veor q13, q15, q15 - mov r4, r1 @ (r4) is aVec - veor q14, q15, q15 - veor q15, q15, q15 - beq .smallvector @ nathan optimized this file based on an objdump - @ but I don't understand this jump. Seems like it should go to loop2 - @ and smallvector (really vector reduction) shouldn't need to be a label - mov r2, #0 @ 0 out r2 (now number) -.loop1: - add r2, r2, #1 @ increment number - vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3 - cmp r2, r3 @ is number < eighth_points - @pld [r12, #64] - vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7 - @pld [r4, #64] - vmla.f32 q12, q4, q0 @ real (re*re) - vmla.f32 q14, q4, q1 @ imag (re*im) - vmls.f32 q15, q5, q1 @ real (im*im) - vmla.f32 q13, q5, q0 @ imag (im*re) - - vmla.f32 q8, q2, q6 @ real (re*re) - vmla.f32 q9, q2, q7 @ imag (re*im) - vmls.f32 q10, q3, q7 @ real (im*im) - vmla.f32 q11, q3, q6 @ imag (im*re) - bne .loop1 - lsl r2, r3, #3 @ r2 = eighth_points * 8 - add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!? - add r5, r5, r2 @ aVec = aVec + eighth_points - @ q12-q13 were original real accumulators - @ q14-q15 were original imag accumulators - @ reduce 8 accumulators down to 2 (1 real, 1 imag) - vadd.f32 q8, q10, q8 @ real + real - vadd.f32 q11, q11, q9 @ imag + imag - vadd.f32 q12, q12, q15 @ real + real - vadd.f32 q14, q14, q13 @ imag + imag - vadd.f32 q8, q8, q12 - vadd.f32 q9, q9, q14 -.smallvector: - lsl r4, r3, #3 - cmp r8, r4 - vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary! 
- vldr s15, [sp, #8] - vldr s17, [sp] - vldr s16, [sp, #4] - vadd.f32 s17, s17, s15 - vldr s11, [sp, #12] - vldr s12, [sp, #24] - vldr s13, [sp, #28] - vldr s14, [sp, #16] - vldr s15, [sp, #20] - vadd.f32 s16, s16, s11 - vadd.f32 s17, s17, s12 - vadd.f32 s16, s16, s13 - vadd.f32 s17, s17, s14 - vadd.f32 s16, s16, s15 - vstr s17, [r7] - vstr s16, [r7, #4] - bls .done -.loop2: - mov r3, r6 - add r6, r6, #8 - vldr s0, [r3] - vldr s1, [r6, #-4] - mov r3, r5 - add r5, r5, #8 - vldr s2, [r3] - vldr s3, [r5, #-4] - bl __mulsc3 @ GCC/Clang built-in. Portability? - add r4, r4, #1 - cmp r4, r8 - vadd.f32 s17, s17, s0 - vadd.f32 s16, s16, s1 - vstr s17, [r7] - vstr s16, [r7, #4] - bne .loop2 -.done: - vpop {q8-q15} - vpop {q0-q7} - pop {r4, r5, r6, r7, r8, pc} - diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_a_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_a_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_a_neonasm.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_a_neonasm.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,47 @@ +@ static inline void volk_32fc_x2_multiply_32fc_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32fc_x2_multiply_32fc_a_neonasm +volk_32fc_x2_multiply_32fc_a_neonasm: + push {r4, r5, r6, r7, r8, r9, r14} + lsrs r7, r3, #2 + @ r0 is c vector + @ r1 is a vector + @ r2 is b vector + @ r3 is num_points + @ r7 is quarter_points + beq .smallvector + mov r5, #0 +.mainloop: + vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13 + add r5, r5, #1 + cmp r5, r7 + vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11 + pld [r1] + pld [r2] + vmul.f32 q0, q12, q10 @ q15 = ar*br + vmul.f32 q1, q13, q11 @ q11 = ai*bi + vmul.f32 q2, q12, q11 @ q14 = ar*bi + vmul.f32 q3, q13, q10 @ q12 = ai*br + vsub.f32 q9, q0, q1 @ real + vadd.f32 q10, q2, q3 @ imag + vst2.32 {q9-q10}, [r0]! 
+ bne .mainloop + +.smallvector: + lsl r5, r7, #2 @ r5 = quarter_points * 4 + cmp r3, r5 @ num_points == quarter_points? + bls .done +.tailcase: + add r5, r5, #1 @ r5 +=1 <- number++ + vld1.32 d1, [r1]! @ s2, s3 = ar, ai + vld1.32 d0, [r2]! @ s0, s1 = br, bi + vmul.f32 s4, s0, s2 @ s4 = ar*br + vmul.f32 s5, s0, s3 @ s5 = ar*bi + vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi + vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br + @vst2.32 d2[0], [r0]! + vst1.32 {d2}, [r0]! + cmp r3, r5 @ r3 == r5? num_points == number? + bne .tailcase +.done: + pop {r4, r5, r6, r7, r8, r9, r15} + bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,47 +0,0 @@ -@ static inline void volk_32fc_x2_multiply_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_32fc_x2_multiply_32fc_neonasm -volk_32fc_x2_multiply_32fc_neonasm: - push {r4, r5, r6, r7, r8, r9, r14} - lsrs r7, r3, #2 - @ r0 is c vector - @ r1 is a vector - @ r2 is b vector - @ r3 is num_points - @ r7 is quarter_points - beq .smallvector - mov r5, #0 -.mainloop: - vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13 - add r5, r5, #1 - cmp r5, r7 - vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11 - pld [r1] - pld [r2] - vmul.f32 q0, q12, q10 @ q15 = ar*br - vmul.f32 q1, q13, q11 @ q11 = ai*bi - vmul.f32 q2, q12, q11 @ q14 = ar*bi - vmul.f32 q3, q13, q10 @ q12 = ai*br - vsub.f32 q9, q0, q1 @ real - vadd.f32 q10, q2, q3 @ imag - vst2.32 {q9-q10}, [r0]! - bne .mainloop - -.smallvector: - lsl r5, r7, #2 @ r5 = quarter_points * 4 - cmp r3, r5 @ num_points == quarter_points? - bls .done -.tailcase: - add r5, r5, #1 @ r5 +=1 <- number++ - vld1.32 d1, [r1]! @ s2, s3 = ar, ai - vld1.32 d0, [r2]! 
@ s0, s1 = br, bi - vmul.f32 s4, s0, s2 @ s4 = ar*br - vmul.f32 s5, s0, s3 @ s5 = ar*bi - vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi - vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br - @vst2.32 d2[0], [r0]! - vst1.32 {d2}, [r0]! - cmp r3, r5 @ r3 == r5? num_points == number? - bne .tailcase -.done: - pop {r4, r5, r6, r7, r8, r9, r15} - bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_a_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_a_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_a_neonasm.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_a_neonasm.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,52 @@ +@ static inline void volk_32f_s32f_multiply_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_s32f_multiply_32f_a_neonasm +volk_32f_s32f_multiply_32f_a_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ prologue - save register states + + + @ quarter_points = num_points / 4 + movs r11, r3, lsr #2 + beq .loop2 @ if zero into quarterPoints + + @ number = quarter_points + mov r10, r3 + @ copy address of input vector + mov r4, r1 + @ copy address of output vector + mov r5, r0 + + @ load the scalar to a quad register + @ vmov.32 d2[0], r2 + @ The scalar might be in s0, not totally sure + vdup.32 q2, d0[0] + + @ this is giving fits. Current theory is hf has something to do with it + .loop1: + @ vld1.32 {q1}, [r4:128]! @ aVal + @ vmul.f32 q3, q1, q2 + @ vst1.32 {q3}, [r5:128]! @ cVal + @ + @ subs r10, r10, #1 + @ bne .loop1 @ first loop + + @ number = quarter_points * 4 + mov r10, r11, asl #2 + + .loop2: + @ cmp num_points, number + @ bls .done + @ + @ vld1.32 {d0[0]}, [aVector]! 
+ @ vmul.f32 s2, s0, s4 + @ vst1.32 {d1[0]}, [cVector]! + @ add number, number, #1 + @ b .loop2 + +.done: + ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states + bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,52 +0,0 @@ -@ static inline void volk_32f_s32f_multiply_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_32f_s32f_multiply_32f_neonasm -volk_32f_s32f_multiply_32f_neonasm: - @ r0 - cVector: pointer to output array - @ r1 - aVector: pointer to input array 1 - @ r2 - bVector: pointer to input array 2 - @ r3 - num_points: number of items to process - - stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ prologue - save register states - - - @ quarter_points = num_points / 4 - movs r11, r3, lsr #2 - beq .loop2 @ if zero into quarterPoints - - @ number = quarter_points - mov r10, r3 - @ copy address of input vector - mov r4, r1 - @ copy address of output vector - mov r5, r0 - - @ load the scalar to a quad register - @ vmov.32 d2[0], r2 - @ The scalar might be in s0, not totally sure - vdup.32 q2, d0[0] - - @ this is giving fits. Current theory is hf has something to do with it - .loop1: - @ vld1.32 {q1}, [r4:128]! @ aVal - @ vmul.f32 q3, q1, q2 - @ vst1.32 {q3}, [r5:128]! @ cVal - @ - @ subs r10, r10, #1 - @ bne .loop1 @ first loop - - @ number = quarter_points * 4 - mov r10, r11, asl #2 - - .loop2: - @ cmp num_points, number - @ bls .done - @ - @ vld1.32 {d0[0]}, [aVector]! - @ vmul.f32 s2, s0, s4 - @ vst1.32 {d1[0]}, [cVector]! 
- @ add number, number, #1 - @ b .loop2 - -.done: - ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states - bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,54 @@ +@ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_add_32f_a_neonasm +volk_32f_x2_add_32f_a_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + @ AAPCS Section 5.1.1 + @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, #0 @ number, 0 +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + vld1.32 {d0-d1}, [aVector:128]! @ aVal + add number, number, #1 + vld1.32 {d2-d3}, [bVector:128]! @ bVal + vadd.f32 cVal, bVal, aVal + cmp number, quarterPoints + vst1.32 {d4-d5}, [cVector:128]! @ cVal + + blt .loop1 @ first loop + + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d0[0]}, [aVector]! + vld1.32 {d0[1]}, [bVector]! + vadd.f32 s2, s1, s0 + vst1.32 {d1[0]}, [cVector]! 
+ add number, number, #1 + b .loop2 + +.done: + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,65 @@ +@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_add_32f_a_neonpipeline +volk_32f_x2_add_32f_a_neonpipeline: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + + @ Optimizing for pipeline + vld1.32 {d0-d1}, [aVector:128]! @ aVal + vld1.32 {d2-d3}, [bVector:128]! @ bVal + subs number, number, #1 + beq .flushpipe + +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + vadd.f32 cVal, bVal, aVal + vld1.32 {d0-d1}, [aVector:128]! @ aVal + vld1.32 {d2-d3}, [bVector:128]! @ bVal + vst1.32 {d4-d5}, [cVector:128]! 
@ cVal + + subs number, number, #1 + bne .loop1 @ first loop + +.flushpipe: + @ One more time + vadd.f32 cVal, bVal, aVal + vst1.32 {d4-d5}, [cVector:128]! @ cVal + + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d0[0]}, [aVector]! + vld1.32 {d0[1]}, [bVector]! + vadd.f32 s2, s1, s0 + vst1.32 {d1[0]}, [cVector]! + add number, number, #1 + b .loop2 + +.done: + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonasm.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonasm.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -@ static inline void volk_32f_x2_add_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_32f_x2_add_32f_neonasm -volk_32f_x2_add_32f_neonasm: - @ r0 - cVector: pointer to output array - @ r1 - aVector: pointer to input array 1 - @ r2 - bVector: pointer to input array 2 - @ r3 - num_points: number of items to process - cVector .req r0 - aVector .req r1 - bVector .req r2 - num_points .req r3 - quarterPoints .req r7 - number .req r8 - aVal .req q0 @ d0-d1 - bVal .req q1 @ d2-d3 - cVal .req q2 @ d4-d5 - - @ AAPCS Section 5.1.1 - @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP - stmfd sp!, {r7, r8, sl} @ prologue - save register states - - movs quarterPoints, num_points, lsr #2 - beq .loop2 @ if zero into quarterPoints - - mov number, #0 @ number, 0 -.loop1: - pld [aVector, #128] @ pre-load hint - this is implementation specific! - pld [bVector, #128] @ pre-load hint - this is implementation specific! - - vld1.32 {d0-d1}, [aVector:128]! @ aVal - add number, number, #1 - vld1.32 {d2-d3}, [bVector:128]! 
@ bVal - vadd.f32 cVal, bVal, aVal - cmp number, quarterPoints - vst1.32 {d4-d5}, [cVector:128]! @ cVal - - blt .loop1 @ first loop - - mov number, quarterPoints, asl #2 - -.loop2: - cmp num_points, number - bls .done - - vld1.32 {d0[0]}, [aVector]! - vld1.32 {d0[1]}, [bVector]! - vadd.f32 s2, s1, s0 - vst1.32 {d1[0]}, [cVector]! - add number, number, #1 - b .loop2 - -.done: - ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states - bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonpipeline.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonpipeline.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonpipeline.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_add_32f_neonpipeline.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,65 +0,0 @@ -@ static inline void volk_32f_x2_add_32f_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_32f_x2_add_32f_neonpipeline -volk_32f_x2_add_32f_neonpipeline: - @ r0 - cVector: pointer to output array - @ r1 - aVector: pointer to input array 1 - @ r2 - bVector: pointer to input array 2 - @ r3 - num_points: number of items to process - cVector .req r0 - aVector .req r1 - bVector .req r2 - num_points .req r3 - quarterPoints .req r7 - number .req r8 - aVal .req q0 @ d0-d1 - bVal .req q1 @ d2-d3 - cVal .req q2 @ d4-d5 - - stmfd sp!, {r7, r8, sl} @ prologue - save register states - - pld [aVector, #128] @ pre-load hint - this is implementation specific! - pld [bVector, #128] @ pre-load hint - this is implementation specific! - - movs quarterPoints, num_points, lsr #2 - beq .loop2 @ if zero into quarterPoints - - mov number, quarterPoints - - @ Optimizing for pipeline - vld1.32 {d0-d1}, [aVector:128]! @ aVal - vld1.32 {d2-d3}, [bVector:128]! @ bVal - subs number, number, #1 - beq .flushpipe - -.loop1: - pld [aVector, #128] @ pre-load hint - this is implementation specific! 
- pld [bVector, #128] @ pre-load hint - this is implementation specific! - vadd.f32 cVal, bVal, aVal - vld1.32 {d0-d1}, [aVector:128]! @ aVal - vld1.32 {d2-d3}, [bVector:128]! @ bVal - vst1.32 {d4-d5}, [cVector:128]! @ cVal - - subs number, number, #1 - bne .loop1 @ first loop - -.flushpipe: - @ One more time - vadd.f32 cVal, bVal, aVal - vst1.32 {d4-d5}, [cVector:128]! @ cVal - - mov number, quarterPoints, asl #2 - -.loop2: - cmp num_points, number - bls .done - - vld1.32 {d0[0]}, [aVector]! - vld1.32 {d0[1]}, [bVector]! - vadd.f32 s2, s1, s0 - vst1.32 {d1[0]}, [cVector]! - add number, number, #1 - b .loop2 - -.done: - ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states - bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm_opts.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm_opts.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm_opts.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm_opts.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,116 @@ +@ static inline void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + @ r0 = cVector + @ r1 = aVector + @ r2 = bVector + @ r3 = num_points + .global volk_32f_x2_dot_prod_32f_a_neonasm_opts +volk_32f_x2_dot_prod_32f_a_neonasm_opts: + push {r4, r5, r6, r7, r8, r9, r10, r11} + @ sixteenth_points = num_points / 16 + lsrs r8, r3, #4 + sub r13, r13, #16 @ subtracting 16 from stack pointer?, wat? + @ 0 out neon accumulators + veor q0, q3, q3 + veor q1, q3, q3 + veor q2, q3, q3 + veor q3, q3, q3 + beq .smallvector @ if less than 16 points skip main loop + mov r7, r2 @ copy input ptrs + mov r6, r1 @ copy input ptrs + mov r5, #0 @ loop counter +.mainloop: + vld4.32 {d16,d18,d20,d22}, [r6]! + add r5, r5, #1 @ inc loop counter + cmp r5, r8 @ loop counter < sixteenth_points? + vld4.32 {d24,d26,d28,d30}, [r7]! 
+ vld4.32 {d17,d19,d21,d23}, [r6]! + vld4.32 {d25,d27,d29,d31}, [r7]! + vmla.f32 q3, q8, q12 + vmla.f32 q0, q13, q9 + vmla.f32 q1, q14, q10 + vmla.f32 q2, q15, q11 + bne .mainloop + lsl r12, r8, #6 @ r12=r8/64 + add r1, r1, r12 + add r2, r2, r12 +.smallvector: @ actually this can be skipped for small vectors + vadd.f32 q3, q3, q0 + lsl r8, r8, #4 @ sixteenth_points * 16 + cmp r3, r8 @ num_points < sixteenth_points*16? + vadd.f32 q2, q1, q2 + vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3 + vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float + vadd.f32 s15, s15, s13 + vadd.f32 s15, s15, s14 + bls .done @ if vector is multiple of 16 then finish + sbfx r11, r1, #2, #1 @ check alignment + rsb r9, r8, r3 + and r11, r11, #3 + mov r6, r1 + cmp r11, r9 + movcs r11, r9 + cmp r9, #3 + movls r11, r9 + cmp r11, #0 + beq .nothingtodo + mov r5, r2 + mov r12, r8 +.dlabel5: + add r12, r12, #1 + vldmia r6!, {s14} + rsb r4, r8, r12 + vldmia r5!, {s13} + cmp r4, r11 + vmla.f32 s15, s13, s14 + mov r7, r6 + mov r4, r5 + bcc .dlabel5 + cmp r9, r11 + beq .done +.dlabel8: + rsb r9, r11, r9 + lsr r8, r9, #2 + lsls r10, r8, #2 + beq .dlabel6 + lsl r6, r11, #2 + veor q8, q8, q8 + add r1, r1, r6 + add r6, r2, r6 + mov r5, #0 +.dlabel9: + add r5, r5, #1 + vld1.32 {d20-d21}, [r6]! + cmp r5, r8 + vld1.64 {d18-d19}, [r1 :64]! 
+ vmla.f32 q8, q10, q9 + bcc .dlabel9 + vadd.f32 d16, d16, d17 + lsl r2, r10, #2 + veor q9, q9, q9 + add r7, r7, r2 + vpadd.f32 d6, d16, d16 + add r4, r4, r2 + cmp r9, r10 + add r12, r12, r10 + vadd.f32 s15, s15, s12 + beq .done +.dlabel6: + mov r2, r7 +.dlabel7: + add r12, r12, #1 + vldmia r2!, {s13} + cmp r3, r12 + vldmia r4!, {s14} + vmla.f32 s15, s13, s14 + bhi .dlabel7 +.done: + vstr s15, [r0] + add r13, r13, #16 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr @ lr is the return address +.nothingtodo: + mov r12, r8 + mov r4, r2 + mov r7, r1 + b .dlabel8 + diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm.s 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm.s 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,58 @@ +@ static inline void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_dot_prod_32f_a_neonasm +volk_32f_x2_dot_prod_32f_a_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + @ AAPCS Section 5.1.1 + @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor.32 q0, q0, q0 + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, #0 @ number, 0 +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! 
+ + vld1.32 {q1}, [aVector:128]! @ aVal + vld1.32 {q2}, [bVector:128]! @ bVal + vmla.f32 q0, q1, q2 + + add number, number, #1 + cmp number, quarterPoints + blt .loop1 @ first loop + + @ strange order comes from trying to schedule instructions + vadd.f32 s0, s0, s1 + vadd.f32 s2, s2, s3 + mov number, quarterPoints, asl #2 + vadd.f32 s0, s0, s2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d1[0]}, [aVector]! + vld1.32 {d1[1]}, [bVector]! + vmla.f32 s0, s2, s3 + add number, number, #1 + b .loop2 + +.done: + vstr s0, [cVector] + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,116 +0,0 @@ -@ static inline void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - @ r0 = cVector - @ r1 = aVector - @ r2 = bVector - @ r3 = num_points - .global volk_32f_x2_dot_prod_32f_neonasm_opts -volk_32f_x2_dot_prod_32f_neonasm_opts: - push {r4, r5, r6, r7, r8, r9, r10, r11} - @ sixteenth_points = num_points / 16 - lsrs r8, r3, #4 - sub r13, r13, #16 @ subtracting 16 from stack pointer?, wat? - @ 0 out neon accumulators - veor q0, q3, q3 - veor q1, q3, q3 - veor q2, q3, q3 - veor q3, q3, q3 - beq .smallvector @ if less than 16 points skip main loop - mov r7, r2 @ copy input ptrs - mov r6, r1 @ copy input ptrs - mov r5, #0 @ loop counter -.mainloop: - vld4.32 {d16,d18,d20,d22}, [r6]! - add r5, r5, #1 @ inc loop counter - cmp r5, r8 @ loop counter < sixteenth_points? - vld4.32 {d24,d26,d28,d30}, [r7]! - vld4.32 {d17,d19,d21,d23}, [r6]! - vld4.32 {d25,d27,d29,d31}, [r7]! 
- vmla.f32 q3, q8, q12 - vmla.f32 q0, q13, q9 - vmla.f32 q1, q14, q10 - vmla.f32 q2, q15, q11 - bne .mainloop - lsl r12, r8, #6 @ r12=r8/64 - add r1, r1, r12 - add r2, r2, r12 -.smallvector: @ actually this can be skipped for small vectors - vadd.f32 q3, q3, q0 - lsl r8, r8, #4 @ sixteenth_points * 16 - cmp r3, r8 @ num_points < sixteenth_points*16? - vadd.f32 q2, q1, q2 - vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3 - vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float - vadd.f32 s15, s15, s13 - vadd.f32 s15, s15, s14 - bls .done @ if vector is multiple of 16 then finish - sbfx r11, r1, #2, #1 @ check alignment - rsb r9, r8, r3 - and r11, r11, #3 - mov r6, r1 - cmp r11, r9 - movcs r11, r9 - cmp r9, #3 - movls r11, r9 - cmp r11, #0 - beq .nothingtodo - mov r5, r2 - mov r12, r8 -.dlabel5: - add r12, r12, #1 - vldmia r6!, {s14} - rsb r4, r8, r12 - vldmia r5!, {s13} - cmp r4, r11 - vmla.f32 s15, s13, s14 - mov r7, r6 - mov r4, r5 - bcc .dlabel5 - cmp r9, r11 - beq .done -.dlabel8: - rsb r9, r11, r9 - lsr r8, r9, #2 - lsls r10, r8, #2 - beq .dlabel6 - lsl r6, r11, #2 - veor q8, q8, q8 - add r1, r1, r6 - add r6, r2, r6 - mov r5, #0 -.dlabel9: - add r5, r5, #1 - vld1.32 {d20-d21}, [r6]! - cmp r5, r8 - vld1.64 {d18-d19}, [r1 :64]! 
- vmla.f32 q8, q10, q9 - bcc .dlabel9 - vadd.f32 d16, d16, d17 - lsl r2, r10, #2 - veor q9, q9, q9 - add r7, r7, r2 - vpadd.f32 d6, d16, d16 - add r4, r4, r2 - cmp r9, r10 - add r12, r12, r10 - vadd.f32 s15, s15, s12 - beq .done -.dlabel6: - mov r2, r7 -.dlabel7: - add r12, r12, #1 - vldmia r2!, {s13} - cmp r3, r12 - vldmia r4!, {s14} - vmla.f32 s15, s13, s14 - bhi .dlabel7 -.done: - vstr s15, [r0] - add r13, r13, #16 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr @ lr is the return address -.nothingtodo: - mov r12, r8 - mov r4, r2 - mov r7, r1 - b .dlabel8 - diff -Nru volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s --- volk-1.3/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,58 +0,0 @@ -@ static inline void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); - .global volk_32f_x2_dot_prod_32f_neonasm -volk_32f_x2_dot_prod_32f_neonasm: - @ r0 - cVector: pointer to output array - @ r1 - aVector: pointer to input array 1 - @ r2 - bVector: pointer to input array 2 - @ r3 - num_points: number of items to process - cVector .req r0 - aVector .req r1 - bVector .req r2 - num_points .req r3 - quarterPoints .req r7 - number .req r8 - aVal .req q0 @ d0-d1 - bVal .req q1 @ d2-d3 - cVal .req q2 @ d4-d5 - - @ AAPCS Section 5.1.1 - @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP - stmfd sp!, {r7, r8, sl} @ prologue - save register states - - veor.32 q0, q0, q0 - movs quarterPoints, num_points, lsr #2 - beq .loop2 @ if zero into quarterPoints - - mov number, #0 @ number, 0 -.loop1: - pld [aVector, #128] @ pre-load hint - this is implementation specific! - pld [bVector, #128] @ pre-load hint - this is implementation specific! 
- - vld1.32 {q1}, [aVector:128]! @ aVal - vld1.32 {q2}, [bVector:128]! @ bVal - vmla.f32 q0, q1, q2 - - add number, number, #1 - cmp number, quarterPoints - blt .loop1 @ first loop - - @ strange order comes from trying to schedule instructions - vadd.f32 s0, s0, s1 - vadd.f32 s2, s2, s3 - mov number, quarterPoints, asl #2 - vadd.f32 s0, s0, s2 - -.loop2: - cmp num_points, number - bls .done - - vld1.32 {d1[0]}, [aVector]! - vld1.32 {d1[1]}, [bVector]! - vmla.f32 s0, s2, s3 - add number, number, #1 - b .loop2 - -.done: - vstr s0, [cVector] - ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states - bx lr diff -Nru volk-1.3/kernels/volk/volk_16ic_convert_32fc.h volk-1.4/kernels/volk/volk_16ic_convert_32fc.h --- volk-1.3/kernels/volk/volk_16ic_convert_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16ic_convert_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -198,7 +198,7 @@ for(number = 0; number < sse_iters; number++) { a16x4 = vld1_s16((const int16_t*)_in); - __builtin_prefetch(_in + 4); + __VOLK_PREFETCH(_in + 4); a32x4 = vmovl_s16(a16x4); f32x4 = vcvtq_f32_s32(a32x4); vst1q_f32((float32_t*)_out, f32x4); diff -Nru volk-1.3/kernels/volk/volk_16ic_x2_dot_prod_16ic.h volk-1.4/kernels/volk/volk_16ic_x2_dot_prod_16ic.h --- volk-1.3/kernels/volk/volk_16ic_x2_dot_prod_16ic.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16ic_x2_dot_prod_16ic.h 2018-03-26 22:52:55.000000000 +0000 @@ -89,16 +89,16 @@ realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); for(number = 0; number < sse_iters; number++) { // 
a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - __builtin_prefetch(_in_a + 8); + __VOLK_PREFETCH(_in_a + 8); b = _mm_load_si128((__m128i*)_in_b); - __builtin_prefetch(_in_b + 8); + __VOLK_PREFETCH(_in_b + 8); c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. @@ -166,16 +166,16 @@ realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); for(number = 0; number < sse_iters; number++) { // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - __builtin_prefetch(_in_a + 8); + __VOLK_PREFETCH(_in_a + 8); b = _mm_loadu_si128((__m128i*)_in_b); - __builtin_prefetch(_in_b + 8); + __VOLK_PREFETCH(_in_b + 8); c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. 
@@ -242,15 +242,15 @@ realcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256(); - mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); for(number = 0; number < avx_iters; number++) { a = _mm256_loadu_si256((__m256i*)_in_a); - __builtin_prefetch(_in_a + 16); + __VOLK_PREFETCH(_in_a + 16); b = _mm256_loadu_si256((__m256i*)_in_b); - __builtin_prefetch(_in_b + 16); + __VOLK_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. 
@@ -318,15 +318,15 @@ realcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256(); - mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); for(number = 0; number < avx_iters; number++) { a = _mm256_load_si256((__m256i*)_in_a); - __builtin_prefetch(_in_a + 16); + __VOLK_PREFETCH(_in_a + 16); b = _mm256_load_si256((__m256i*)_in_b); - __builtin_prefetch(_in_b + 16); + __VOLK_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. 
@@ -399,8 +399,8 @@ { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 8); - __builtin_prefetch(b_ptr + 8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -465,8 +465,8 @@ { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 8); - __builtin_prefetch(b_ptr + 8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); @@ -519,8 +519,8 @@ { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 8); - __builtin_prefetch(b_ptr + 8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); diff -Nru volk-1.3/kernels/volk/volk_16ic_x2_multiply_16ic.h volk-1.4/kernels/volk/volk_16ic_x2_multiply_16ic.h --- volk-1.3/kernels/volk/volk_16ic_x2_multiply_16ic.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16ic_x2_multiply_16ic.h 2018-03-26 22:52:55.000000000 +0000 @@ -71,8 +71,8 @@ const unsigned int sse_iters = num_points / 4; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result; - mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8(0, 0, 0xFF, 
0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; @@ -123,8 +123,8 @@ const unsigned int sse_iters = num_points / 4; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; - mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; @@ -181,8 +181,8 @@ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; - const __m256i mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - const __m256i mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); for(;number < avx2_points; number++) { @@ -235,8 +235,8 @@ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; - const __m256i mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - const __m256i mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 
0, 255, 255); + const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); for(;number < avx2_points; number++) { @@ -291,8 +291,8 @@ { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 4); - __builtin_prefetch(b_ptr + 4); + __VOLK_PREFETCH(a_ptr + 4); + __VOLK_PREFETCH(b_ptr + 4); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r diff -Nru volk-1.3/kernels/volk/volk_16i_max_star_16i.h volk-1.4/kernels/volk/volk_16i_max_star_16i.h --- volk-1.3/kernels/volk/volk_16i_max_star_16i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16i_max_star_16i.h 2018-03-26 22:52:55.000000000 +0000 @@ -139,7 +139,7 @@ for(number=0; number < eighth_points; ++number) { input_vec = vld1q_s16(src0); - __builtin_prefetch(src0+16); + __VOLK_PREFETCH(src0+16); diff = vsubq_s16(candidate_vec, input_vec); comp1 = vcgeq_s16(diff, zeros); comp2 = vcltq_s16(diff, zeros); diff -Nru volk-1.3/kernels/volk/volk_16i_max_star_horizontal_16i.h volk-1.4/kernels/volk/volk_16i_max_star_horizontal_16i.h --- volk-1.3/kernels/volk/volk_16i_max_star_horizontal_16i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16i_max_star_horizontal_16i.h 2018-03-26 22:52:55.000000000 +0000 @@ -169,7 +169,7 @@ zeros = veorq_s16(zeros, zeros); for(number=0; number < eighth_points; ++number) { input_vec = vld2q_s16(src0); - //__builtin_prefetch(src0+16); + //__VOLK_PREFETCH(src0+16); diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); comp1 = vcgeq_s16(diff, zeros); comp2 = vcltq_s16(diff, zeros); @@ -190,7 
+190,7 @@ #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON -extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); +extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC diff -Nru volk-1.3/kernels/volk/volk_16i_x4_quad_max_star_16i.h volk-1.4/kernels/volk/volk_16i_x4_quad_max_star_16i.h --- volk-1.3/kernels/volk/volk_16i_x4_quad_max_star_16i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16i_x4_quad_max_star_16i.h 2018-03-26 22:52:55.000000000 +0000 @@ -132,7 +132,7 @@ } - /*asm volatile + /*__VOLK_ASM __VOLK_VOLATILE ( "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" "cmp $0, %[bound]\n\t" diff -Nru volk-1.3/kernels/volk/volk_16i_x5_add_quad_16i_x4.h volk-1.4/kernels/volk/volk_16i_x5_add_quad_16i_x4.h --- volk-1.3/kernels/volk/volk_16i_x5_add_quad_16i_x4.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16i_x5_add_quad_16i_x4.h 2018-03-26 22:52:55.000000000 +0000 @@ -121,7 +121,7 @@ p_target2 += 1; p_target3 += 1; } - /*asm volatile + /*__VOLK_ASM __VOLK_VOLATILE ( ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" "cmp $0, %[bound]\n\t" diff -Nru volk-1.3/kernels/volk/volk_16u_byteswap.h volk-1.4/kernels/volk/volk_16u_byteswap.h --- volk-1.3/kernels/volk/volk_16u_byteswap.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_16u_byteswap.h 2018-03-26 22:52:55.000000000 +0000 @@ -256,7 +256,7 @@ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; - /* these magic numbers are used as byte-indeces in the LUT. + /* these magic numbers are used as byte-indices in the LUT. they are pre-computed to save time. 
A simple C program can calculate them; for example for lookup01: uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; diff -Nru volk-1.3/kernels/volk/volk_32f_64f_add_64f.h volk-1.4/kernels/volk/volk_32f_64f_add_64f.h --- volk-1.3/kernels/volk/volk_32f_64f_add_64f.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_64f_add_64f.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,203 @@ +/* -*- c++ -*- */ +/* + * Copyright 2018 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_32f_64f_add_64f + * + * \b Overview + * + * Adds a single-precision floating point vector to a double-precision floating point vector, element by element. + * + * c[i] = a[i] + b[i] + * + * Dispatcher Prototype + * \code + * void volk_32f_64f_add_64f(double* cVector, const float* aVector, const double* bVector, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li aVector: First input vector (single-precision). + * \li bVector: Second input vector (double-precision). + * \li num_points: The number of values in both input vectors. + * + * \b Outputs + * \li cVector: The output vector (double-precision). + * + * \b Example + * Add the elements of an increasing vector to those of a decreasing vector. 
+ * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); + * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment); + * double* out = (double*)volk_malloc(sizeof(double)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * increasing[ii] = (double)ii; + * decreasing[ii] = 10.f - (double)ii; + * } + * + * volk_32f_64f_add_64f(out, increasing, decreasing, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %1.2F\n", ii, out[ii]); + * } + * + * volk_free(increasing); + * volk_free(decreasing); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32f_64f_add_64f_H +#define INCLUDED_volk_32f_64f_add_64f_H + +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32f_64f_add_64f_generic(double *cVector, const float *aVector, + const double *bVector, unsigned int num_points) +{ + double *cPtr = cVector; + const float *aPtr = aVector; + const double *bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); + } +} + +#endif /* LV_HAVE_GENERIC */ + +/* + * Unaligned versions + */ + + +#ifdef LV_HAVE_AVX + +#include +#include + +static inline void +volk_32f_64f_add_64f_u_avx(double *cVector, const float *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; + + double *cPtr = cVector; + const float *aPtr = aVector; + const double *bPtr = bVector; + + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { + + aVal = _mm256_loadu_ps(aPtr); + bVal1 = _mm256_loadu_pd(bPtr); + bVal2 = _mm256_loadu_pd(bPtr+4); + + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); + + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); + + cVal1 
= _mm256_add_pd(aDbl1, bVal1); + cVal2 = _mm256_add_pd(aDbl2, bVal2); + + _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX + +#include +#include + +static inline void +volk_32f_64f_add_64f_a_avx(double *cVector, const float *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; + + double *cPtr = cVector; + const float *aPtr = aVector; + const double *bPtr = bVector; + + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { + + aVal = _mm256_load_ps(aPtr); + bVal1 = _mm256_load_pd(bPtr); + bVal2 = _mm256_load_pd(bPtr+4); + + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); + + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); + + cVal1 = _mm256_add_pd(aDbl1, bVal1); + cVal2 = _mm256_add_pd(aDbl2, bVal2); + + _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_store_pd(cPtr+4, cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + + + +#endif /* INCLUDED_volk_32f_64f_add_64f_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_64f_multiply_64f.h volk-1.4/kernels/volk/volk_32f_64f_multiply_64f.h --- volk-1.3/kernels/volk/volk_32f_64f_multiply_64f.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_64f_multiply_64f.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,203 @@ +/* -*- c++ 
-*- */ +/* + * Copyright 2018 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_32f_64f_multiply_64f + * + * \b Overview + * + * Multiplies a single-precision floating point vector by a double-precision floating point vector, element by element. + * + * c[i] = a[i] * b[i] + * + * Dispatcher Prototype + * \code + * void volk_32f_64f_multiply_64f(double* cVector, const float* aVector, const double* bVector, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li aVector: First input vector (single-precision). + * \li bVector: Second input vector (double-precision). + * \li num_points: The number of values in both input vectors. + * + * \b Outputs + * \li cVector: The output vector (double-precision). + * + * \b Example + * Multiply elements of an increasing vector by those of a decreasing vector. 
+ * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); + * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment); + * double* out = (double*)volk_malloc(sizeof(double)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * increasing[ii] = (double)ii; + * decreasing[ii] = 10.f - (double)ii; + * } + * + * volk_32f_64f_multiply_64f(out, increasing, decreasing, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %1.2F\n", ii, out[ii]); + * } + * + * volk_free(increasing); + * volk_free(decreasing); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32f_64f_multiply_64f_H +#define INCLUDED_volk_32f_64f_multiply_64f_H + +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, + const double *bVector, unsigned int num_points) +{ + double *cPtr = cVector; + const float *aPtr = aVector; + const double *bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); + } +} + +#endif /* LV_HAVE_GENERIC */ + +/* + * Unaligned versions + */ + + +#ifdef LV_HAVE_AVX + +#include +#include + +static inline void +volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; + + double *cPtr = cVector; + const float *aPtr = aVector; + const double *bPtr = bVector; + + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { + + aVal = _mm256_loadu_ps(aPtr); + bVal1 = _mm256_loadu_pd(bPtr); + bVal2 = _mm256_loadu_pd(bPtr+4); + + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); + + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = 
_mm256_cvtps_pd(aVal2); + + cVal1 = _mm256_mul_pd(aDbl1, bVal1); + cVal2 = _mm256_mul_pd(aDbl2, bVal2); + + _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX + +#include +#include + +static inline void +volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; + + double *cPtr = cVector; + const float *aPtr = aVector; + const double *bPtr = bVector; + + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { + + aVal = _mm256_load_ps(aPtr); + bVal1 = _mm256_load_pd(bPtr); + bVal2 = _mm256_load_pd(bPtr+4); + + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); + + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); + + cVal1 = _mm256_mul_pd(aDbl1, bVal1); + cVal2 = _mm256_mul_pd(aDbl2, bVal2); + + _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_store_pd(cPtr+4, cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + + + +#endif /* INCLUDED_volk_32f_64f_multiply_64f_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_8u_polarbutterfly_32f.h volk-1.4/kernels/volk/volk_32f_8u_polarbutterfly_32f.h --- volk-1.3/kernels/volk/volk_32f_8u_polarbutterfly_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_8u_polarbutterfly_32f.h 
2018-03-26 22:52:55.000000000 +0000 @@ -76,8 +76,8 @@ static inline float llr_odd(const float la, const float lb) { - const float ala = fabs(la); - const float alb = fabs(lb); + const float ala = fabsf(la); + const float alb = fabsf(lb); return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala); } @@ -156,9 +156,10 @@ static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, - const int frame_size, const int frame_exp, + const int frame_exp, const int stage, const int u_num, const int row) { + const int frame_size = 0x01 << frame_exp; const int next_stage = stage + 1; const int half_stage_size = 0x01 << stage; @@ -189,10 +190,10 @@ if(frame_exp > next_stage){ unsigned char* u_half = u + frame_size; odd_xor_even_values(u_half, u, u_num); - volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_size, frame_exp, next_stage, u_num, next_upper_row); + volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); even_u_values(u_half, u, u_num); - volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_size, frame_exp, next_stage, u_num, next_lower_row); + volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); } *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); @@ -200,22 +201,17 @@ #endif /* LV_HAVE_GENERIC */ + #ifdef LV_HAVE_AVX #include - -/* - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/# - * lists '__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)'. - * But GCC 4.8.4 doesn't know about it. Or headers are missing or something. 
Anyway, it doesn't compile :( - * This is what I want: llr0 = _mm256_loadu2_m128(src_llr_ptr, src_llr_ptr + 8); - * also useful but missing: _mm256_set_m128(hi, lo) - */ +#include static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, - const int frame_size, const int frame_exp, + const int frame_exp, const int stage, const int u_num, const int row) { + const int frame_size = 0x01 << frame_exp; if(row % 2){ // for odd rows just do the only necessary calculation and return. const float* next_llrs = llrs + frame_size + row; *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); @@ -224,7 +220,7 @@ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); if(max_stage_depth < 3){ // vectorized version needs larger vectors. - volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_size, frame_exp, stage, u_num, row); + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); return; } @@ -235,8 +231,6 @@ float* dst_llr_ptr; __m256 src0, src1, dst; - __m256 part0, part1; - __m256 llr0, llr1; if(row){ // not necessary for ZERO row. == first bit to be decoded. 
// first do bit combination for all stages @@ -256,13 +250,7 @@ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; dst_llr_ptr = llrs + max_stage_depth * frame_size + row; - const __m128i zeros = _mm_set1_epi8(0x00); - const __m128i sign_extract = _mm_set1_epi8(0x80); - const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03); - const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); - __m128i fbits, sign_bits0, sign_bits1; - - __m256 sign_mask; + __m128i fbits; int p; for(p = 0; p < stage_size; p += 8){ @@ -270,29 +258,11 @@ fbits = _mm_loadu_si128((__m128i*) u_target); u_target += 8; - // prepare sign mask for correct +- - fbits = _mm_cmpgt_epi8(fbits, zeros); - fbits = _mm_and_si128(fbits, sign_extract); - sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); - sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); - - src0 = _mm256_loadu_ps(src_llr_ptr); src1 = _mm256_loadu_ps(src_llr_ptr + 8); src_llr_ptr += 16; - sign_mask = _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits0), 0x0); - sign_mask = _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); - - // deinterleave values - part0 = _mm256_permute2f128_ps(src0, src1, 0x20); - part1 = _mm256_permute2f128_ps(src0, src1, 0x31); - llr0 = _mm256_shuffle_ps(part0, part1, 0x88); - llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); - - // calculate result - llr0 = _mm256_xor_ps(llr0, sign_mask); - dst = _mm256_add_ps(llr0, llr1); + dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); _mm256_storeu_ps(dst_llr_ptr, dst); dst_llr_ptr += 8; @@ -303,9 +273,8 @@ } const int min_stage = stage > 2 ? 
stage : 2; - const __m256 sign_mask = _mm256_set1_ps(-0.0); - const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); - __m256 sign; + + _mm256_zeroall(); // Important to clear cache! int el; while(min_stage < loop_stage){ @@ -317,16 +286,7 @@ src1 = _mm256_loadu_ps(src_llr_ptr); src_llr_ptr += 8; - // deinterleave values - part0 = _mm256_permute2f128_ps(src0, src1, 0x20); - part1 = _mm256_permute2f128_ps(src0, src1, 0x31); - llr0 = _mm256_shuffle_ps(part0, part1, 0x88); - llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); - - // calculate result - sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); - dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); - dst = _mm256_or_ps(dst, sign); + dst = _mm256_polar_minsum_llrs(src0, src1); _mm256_storeu_ps(dst_llr_ptr, dst); dst_llr_ptr += 8; diff -Nru volk-1.3/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h volk-1.4/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h --- volk-1.3/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -100,7 +100,7 @@ unsigned int u_num = 0; for(; u_num < frame_size; u_num++){ - volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_size, frame_exp, 0, u_num, u_num); + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); u[u_num] = llrs[u_num] > 0 ? 0 : 1; } @@ -120,7 +120,7 @@ unsigned int u_num = 0; for(; u_num < frame_size; u_num++){ - volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num); + volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); u[u_num] = llrs[u_num] > 0 ? 
0 : 1; } diff -Nru volk-1.3/kernels/volk/volk_32f_accumulator_s32f.h volk-1.4/kernels/volk/volk_32f_accumulator_s32f.h --- volk-1.3/kernels/volk/volk_32f_accumulator_s32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_accumulator_s32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -65,10 +65,92 @@ #include #include -#include -#ifdef LV_HAVE_SSE +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points) +{ + float returnValue = 0; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 aVal = _mm256_setzero_ps(); + + for(;number < eighthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + accumulator = _mm256_add_ps(accumulator, aVal); + aPtr += 8; + } + + _mm256_store_ps(tempBuffer, accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + returnValue += tempBuffer[4]; + returnValue += tempBuffer[5]; + returnValue += tempBuffer[6]; + returnValue += tempBuffer[7]; + + number = eighthPoints * 8; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points) +{ + float returnValue = 0; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 aVal = _mm256_setzero_ps(); + + for(;number < eighthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + accumulator = _mm256_add_ps(accumulator, aVal); + aPtr += 8; + } + + _mm256_store_ps(tempBuffer, 
accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + returnValue += tempBuffer[4]; + returnValue += tempBuffer[5]; + returnValue += tempBuffer[6]; + returnValue += tempBuffer[7]; + + number = eighthPoints * 8; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE #include static inline void @@ -90,7 +172,7 @@ aPtr += 4; } - _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(tempBuffer,accumulator); returnValue = tempBuffer[0]; returnValue += tempBuffer[1]; @@ -103,13 +185,47 @@ } *result = returnValue; } - #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_SSE +#include -#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points) +{ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_loadu_ps(aPtr); /* unaligned load: this is the _u_ kernel, inputBuffer may not be 16-byte aligned */ + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + + _mm_store_ps(tempBuffer,accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC static inline void volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points) { @@ -122,7 +238,6 @@ } *result = returnValue; } - #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */ diff -Nru
volk-1.3/kernels/volk/volk_32f_acos_32f.h volk-1.4/kernels/volk/volk_32f_acos_32f.h --- volk-1.3/kernels/volk/volk_32f_acos_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_acos_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -135,7 +135,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = acos(*aPtr++); + *bPtr++ = acosf(*aPtr++); } } @@ -206,7 +206,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = acos(*aPtr++); + *bPtr++ = acosf(*aPtr++); } } @@ -222,7 +222,7 @@ unsigned int number = 0; for(number = 0; number < num_points; number++){ - *bPtr++ = acos(*aPtr++); + *bPtr++ = acosf(*aPtr++); } } diff -Nru volk-1.3/kernels/volk/volk_32f_asin_32f.h volk-1.4/kernels/volk/volk_32f_asin_32f.h --- volk-1.3/kernels/volk/volk_32f_asin_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_asin_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -133,7 +133,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = asin(*aPtr++); + *bPtr++ = asinf(*aPtr++); } } @@ -200,7 +200,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = asin(*aPtr++); + *bPtr++ = asinf(*aPtr++); } } @@ -216,7 +216,7 @@ unsigned int number = 0; for(number = 0; number < num_points; number++){ - *bPtr++ = asin(*aPtr++); + *bPtr++ = asinf(*aPtr++); } } #endif /* LV_HAVE_GENERIC */ diff -Nru volk-1.3/kernels/volk/volk_32f_atan_32f.h volk-1.4/kernels/volk/volk_32f_atan_32f.h --- volk-1.3/kernels/volk/volk_32f_atan_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_atan_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -132,7 +132,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = atan(*aPtr++); + *bPtr++ = atanf(*aPtr++); } } @@ -196,7 +196,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = atan(*aPtr++); + *bPtr++ = atanf(*aPtr++); } } @@ -212,7 +212,7 @@ unsigned int 
number = 0; for(number = 0; number < num_points; number++){ - *bPtr++ = atan(*aPtr++); + *bPtr++ = atanf(*aPtr++); } } #endif /* LV_HAVE_GENERIC */ diff -Nru volk-1.3/kernels/volk/volk_32fc_32f_add_32fc.h volk-1.4/kernels/volk/volk_32fc_32f_add_32fc.h --- volk-1.3/kernels/volk/volk_32fc_32f_add_32fc.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_32f_add_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,238 @@ +/* -*- c++ -*- */ +/* + * Copyright 2018 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_32fc_32f_add_32fc + * + * \b Overview + * + * Adds two vectors together element by element: + * + * c[i] = a[i] + b[i] + * + * Dispatcher Prototype + * \code + * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li aVector: First vector of input points. + * \li bVector: Second vector of input points. + * \li num_points: The number of values in both input vectors. + * + * \b Outputs + * \li cVector: The output vector.
+ * + * \b Example + * + * The following example adds the increasing and decreasing vectors such that the result of every summation pair is 10 + * + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment); + * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * increasing[ii] = (lv_32fc_t)ii; + * decreasing[ii] = 10.f - (float)ii; + * } + * + * volk_32fc_32f_add_32fc(out, increasing, decreasing, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %1.2f\n", ii, out[ii]); + * } + * + * volk_free(increasing); + * volk_free(decreasing); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H +#define INCLUDED_volk_32fc_32f_add_32fc_u_H + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const float* bVector, unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal1, aVal2, bVal, cVal1, cVal2; + __m256 cpx_b1, cpx_b2; + __m256 zero; + zero = _mm256_setzero_ps(); + __m256 tmp1, tmp2; + for(;number < eighthPoints; number++){ + + aVal1 = _mm256_loadu_ps((float *) aPtr); + aVal2 = _mm256_loadu_ps((float *)
(aPtr+4)); + bVal = _mm256_loadu_ps(bPtr); + cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 + cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 + + tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); + tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); + + cVal1 = _mm256_add_ps(aVal1, tmp1); + cVal2 = _mm256_add_ps(aVal2, tmp2); + + _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container + _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal1, aVal2, bVal, cVal1, cVal2; + __m256 cpx_b1, cpx_b2; + __m256 zero; + zero = _mm256_setzero_ps(); + __m256 tmp1, tmp2; + for(;number < eighthPoints; number++){ + + aVal1 = _mm256_load_ps((float *) aPtr); + aVal2 = _mm256_load_ps((float *) (aPtr+4)); + bVal = _mm256_load_ps(bPtr); + cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 + cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 + + tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); + tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); + + cVal1 = _mm256_add_ps(aVal1, tmp1); + cVal2 = _mm256_add_ps(aVal2, tmp2); + + _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container + _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } 
+ + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const float* bVector, unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + + float32x4x4_t aVal0, aVal1; + float32x4x2_t bVal0, bVal1; + + const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + for(; number < sixteenthPoints; number++){ + aVal0 = vld4q_f32((const float*)aPtr); + aPtr += 8; + aVal1 = vld4q_f32((const float*)aPtr); + aPtr += 8; + __VOLK_PREFETCH(aPtr+16); + + bVal0 = vld2q_f32((const float*)bPtr); + bPtr += 8; + bVal1 = vld2q_f32((const float*)bPtr); + bPtr += 8; + __VOLK_PREFETCH(bPtr+16); + + aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); + aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); + + aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); + aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); + + vst4q_f32((float*)(cPtr), aVal0); + cPtr += 8; + vst4q_f32((float*)(cPtr), aVal1); + cPtr += 8; + } + + for(number = sixteenthPoints * 16; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + + +#endif /* INCLUDED_volk_32fc_32f_add_32fc_a_H */ diff -Nru volk-1.3/kernels/volk/volk_32fc_conjugate_32fc.h volk-1.4/kernels/volk/volk_32fc_conjugate_32fc.h --- volk-1.3/kernels/volk/volk_32fc_conjugate_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_conjugate_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -248,7 +248,7 @@ const lv_32fc_t* a = aVector; for(number=0; number < quarterPoints; number++){ - __builtin_prefetch(a+4); + __VOLK_PREFETCH(a+4); x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di // xor the imaginary lane diff -Nru 
volk-1.3/kernels/volk/volk_32fc_convert_16ic.h volk-1.4/kernels/volk/volk_32fc_convert_16ic.h --- volk-1.3/kernels/volk/volk_32fc_convert_16ic.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_convert_16ic.h 2018-03-26 22:52:55.000000000 +0000 @@ -75,7 +75,7 @@ { inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - __builtin_prefetch(inputVectorPtr + 8); + __VOLK_PREFETCH(inputVectorPtr + 8); // Clip ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); @@ -128,7 +128,7 @@ { inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - __builtin_prefetch(inputVectorPtr + 8); + __VOLK_PREFETCH(inputVectorPtr + 8); // Clip ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); @@ -184,7 +184,7 @@ { a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; - __builtin_prefetch(inputVectorPtr + 8); + __VOLK_PREFETCH(inputVectorPtr + 8); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); diff -Nru volk-1.3/kernels/volk/volk_32fc_index_max_16u.h volk-1.4/kernels/volk/volk_32fc_index_max_16u.h --- volk-1.3/kernels/volk/volk_32fc_index_max_16u.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_index_max_16u.h 2018-03-26 22:52:55.000000000 +0000 @@ -115,10 +115,9 @@ int i = 0; xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! 
- xmm9 = xmm8 = _mm_setzero_si128(); + xmm9 = _mm_setzero_si128(); xmm10 = _mm_set_epi32(4, 4, 4, 4); xmm3 = _mm_setzero_ps(); - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); for(; i < bound; ++i) { diff -Nru volk-1.3/kernels/volk/volk_32fc_index_max_32u.h volk-1.4/kernels/volk/volk_32fc_index_max_32u.h --- volk-1.3/kernels/volk/volk_32fc_index_max_32u.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_index_max_32u.h 2018-03-26 22:52:55.000000000 +0000 @@ -104,7 +104,7 @@ int i = 0; xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! - xmm9 = xmm8 = _mm_setzero_si128(); + xmm9 = _mm_setzero_si128(); xmm10 = _mm_set_epi32(4, 4, 4, 4); xmm3 = _mm_setzero_ps(); diff -Nru volk-1.3/kernels/volk/volk_32f_cos_32f.h volk-1.4/kernels/volk/volk_32f_cos_32f.h --- volk-1.3/kernels/volk/volk_32f_cos_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_cos_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -158,7 +158,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = cos(*aPtr++); + *bPtr++ = cosf(*aPtr++); } } @@ -240,7 +240,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = cos(*aPtr++); + *bPtr++ = cosf(*aPtr++); } } @@ -315,7 +315,7 @@ unsigned int number = 0; for(; number < num_points; number++){ - *bPtr++ = cos(*aPtr++); + *bPtr++ = cosf(*aPtr++); } } diff -Nru volk-1.3/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h volk-1.4/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h --- volk-1.3/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -35,7 +35,7 @@ * \b Inputs * \li complexFFTInput The complex data output from the FFT point. 
* \li normalizationFactor: This value is divided against all the input values before the power is calculated. - * \li rbw: The resolution bandwith of the fft spectrum + * \li rbw: The resolution bandwidth of the fft spectrum * \li num_points: The number of fft data points. * * \b Outputs diff -Nru volk-1.3/kernels/volk/volk_32fc_x2_add_32fc.h volk-1.4/kernels/volk/volk_32fc_x2_add_32fc.h --- volk-1.3/kernels/volk/volk_32fc_x2_add_32fc.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_x2_add_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,280 @@ +/* -*- c++ -*- */ +/* + * Copyright 2018 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_32fc_x2_add_32fc + * + * \b Overview + * + * Adds two vectors together element by element: + * + * c[i] = a[i] + b[i] + * + * Dispatcher Prototype + * \code + * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li aVector: First vector of input points. + * \li bVector: Second vector of input points. + * \li num_points: The number of values in both input vectors. + * + * \b Outputs + * \li cVector: The output vector.
+ * + * \b Example + * + * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 + * + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t* decreasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * increasing[ii] = (lv_32fc_t)ii; + * decreasing[ii] = 10.f - (lv_32fc_t)ii; + * } + * + * volk_32fc_x2_add_32fc(out, increasing, decreasing, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %1.2f\n", ii, out[ii]); + * } + * + * volk_free(increasing); + * volk_free(decreasing); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H +#define INCLUDED_volk_32fc_x2_add_32fc_u_H + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const lv_32fc_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_loadu_ps((float *) aPtr); + bVal = _mm256_loadu_ps((float *) bPtr); + + cVal = _mm256_add_ps(aVal, bVal); + + _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const lv_32fc_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; 
+ const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_load_ps((float*) aPtr); + bVal = _mm256_load_ps((float*) bPtr); + + cVal = _mm256_add_ps(aVal, bVal); + + _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE +#include + +static inline void +volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const lv_32fc_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_loadu_ps((float *) aPtr); + bVal = _mm_loadu_ps((float *) bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const lv_32fc_t* bVector, unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE +#include + +static inline void +volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const 
lv_32fc_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < halfPoints; number++){ + aVal = _mm_load_ps((float *) aPtr); + bVal = _mm_load_ps((float *) bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const lv_32fc_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + float32x4_t aVal, bVal, cVal; + for(number=0; number < halfPoints; number++){ + // Load in to NEON registers + aVal = vld1q_f32((const float32_t*)(aPtr)); + bVal = vld1q_f32((const float32_t*)(bPtr)); + __VOLK_PREFETCH(aPtr+2); + __VOLK_PREFETCH(bPtr+2); + + // vector add + cVal = vaddq_f32(aVal, bVal); + // Store the results back into the C container + vst1q_f32((float*)(cPtr),cVal); + + aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; // should be = num_points + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} + +#endif /* LV_HAVE_NEON */ + + +#endif /* INCLUDED_volk_32fc_x2_add_32fc_a_H */ diff -Nru volk-1.3/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h volk-1.4/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h --- volk-1.3/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ 
volk-1.4/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -65,7 +65,6 @@ #ifdef LV_HAVE_GENERIC - static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { const unsigned int num_bytes = num_points*8; @@ -76,46 +75,107 @@ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; sum1[0] += in[2] * tp[2] + in[3] * tp[3]; sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - in += 4; tp += 4; - } - res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - for(i = 0; i < isodd; ++i) { + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + } +} +#endif /*LV_HAVE_GENERIC*/ - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); +#ifdef LV_HAVE_AVX +#include +#include "volk/volk_avx_intrinsics.h" +static inline void +volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + int quarter_points = num_points / 4; + __m256 avec, bvec, resultvec, sumvec; + sumvec = _mm256_set1_ps(0.f); + const float *a_p = (const float*) input; + const float *b_p = (const float*) taps; + + int qpoint; + for (qpoint = 0; qpoint < quarter_points; ++qpoint) { + avec = _mm256_loadu_ps(a_p); + bvec = _mm256_loadu_ps(b_p); + resultvec = _mm256_complexconjugatemul_ps(avec, bvec); + sumvec = _mm256_add_ps(sumvec, resultvec); + a_p += 8; + b_p += 8; } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t tmp_result[4]; + _mm256_store_ps((float*)tmp_result, sumvec); + *result = tmp_result[0] + tmp_result[1]; + *result += tmp_result[2] + 
tmp_result[3]; + + int point; + for (point=quarter_points*4; point < num_points; ++point) { + float a_r = *a_p++; + float a_i = *a_p++; + float b_r = *b_p++; /* tap real part must come from b_p; reading a_p here consumed the next input sample */ + float b_i = *b_p++; + *result += lv_cmake(a_r*b_r + a_i*b_i, a_r*-b_i + a_i*b_r); } - */ } +#endif /* LV_HAVE_AVX */ -#endif /*LV_HAVE_GENERIC*/ +#ifdef LV_HAVE_AVX +#include +#include "volk/volk_avx_intrinsics.h" +static inline void +volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + int quarter_points = num_points / 4; + __m256 avec, bvec, resultvec, sumvec; + sumvec = _mm256_set1_ps(0.f); + const float *a_p = (const float*) input; + const float *b_p = (const float*) taps; + + int qpoint; + for (qpoint = 0; qpoint < quarter_points; ++qpoint) { + avec = _mm256_load_ps(a_p); + bvec = _mm256_load_ps(b_p); + resultvec = _mm256_complexconjugatemul_ps(avec, bvec); + sumvec = _mm256_add_ps(sumvec, resultvec); + + a_p += 8; + b_p += 8; + } + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t tmp_result[4]; + _mm256_store_ps((float*)tmp_result, sumvec); + *result = tmp_result[0] + tmp_result[1]; + *result += tmp_result[2] + tmp_result[3]; + + int point; + for (point=quarter_points*4; point < num_points; ++point) { + float a_r = *a_p++; + float a_i = *a_p++; + float b_r = *b_p++; /* tap real part must come from b_p; reading a_p here consumed the next input sample */ + float b_i = *b_p++; + *result += lv_cmake(a_r*b_r + a_i*b_i, a_r*-b_i + a_i*b_r); + } +} +#endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 @@ -123,7 +183,6 @@ #include #include - static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { unsigned int num_bytes = num_points*8; @@ -219,8 +278,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); +
__VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); @@ -273,44 +332,26 @@ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; sum1[0] += in[2] * tp[2] + in[3] * tp[3]; sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - in += 4; tp += 4; - } - res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - for(i = 0; i < isodd; ++i) { - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ } #endif /*LV_HAVE_GENERIC*/ @@ -318,17 +359,13 @@ #if LV_HAVE_SSE && LV_HAVE_64 - static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { const unsigned int num_bytes = num_points*8; __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - asm volatile + __VOLK_ASM __VOLK_VOLATILE ( "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -446,18 +483,11 @@ :"rax", "r8", "r9", "r10" ); - int getem = num_bytes % 16; - for(; getem > 0; getem -= 8) { - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - } - - return; } #endif @@ -471,8 +501,7 @@ int bound = num_bytes >> 4; int leftovers = num_bytes % 16; - - asm volatile + __VOLK_ASM __VOLK_VOLATILE ( " #pushl %%ebp\n\t" " #movl %%esp, %%ebp\n\t" @@ -577,29 +606,11 @@ : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) ); - - - - printf("%d, %d\n", leftovers, bound); - for(; 
leftovers > 0; leftovers -= 8) { - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - } - - return; - - - - - - } - #endif /*LV_HAVE_SSE*/ - #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ diff -Nru volk-1.3/kernels/volk/volk_32fc_x2_divide_32fc.h volk-1.4/kernels/volk/volk_32fc_x2_divide_32fc.h --- volk-1.3/kernels/volk/volk_32fc_x2_divide_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_x2_divide_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -315,6 +315,55 @@ } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, + const lv_32fc_t* bVector, unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + + float32x4x2_t aVal, bVal, cVal; + float32x4_t bAbs, bAbsInv; + + const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + for(; number < quarterPoints; number++){ + aVal = vld2q_f32((const float*)(aPtr)); + bVal = vld2q_f32((const float*)(bPtr)); + aPtr += 4; + bPtr += 4; + __VOLK_PREFETCH(aPtr+4); + __VOLK_PREFETCH(bPtr+4); + + bAbs = vmulq_f32( bVal.val[0], bVal.val[0]); + bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]); + + bAbsInv = vrecpeq_f32(bAbs); + bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); + bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); + + cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]); + cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); + cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); + + cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]); + cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); + cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); + + vst2q_f32((float*)(cPtr), cVal); + cPtr += 4; + } + + for(number = quarterPoints * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef 
LV_HAVE_GENERIC diff -Nru volk-1.3/kernels/volk/volk_32fc_x2_dot_prod_32fc.h volk-1.4/kernels/volk/volk_32fc_x2_dot_prod_32fc.h --- volk-1.3/kernels/volk/volk_32fc_x2_dot_prod_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_x2_dot_prod_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -109,7 +109,7 @@ const unsigned int num_bytes = num_points*8; unsigned int isodd = num_points & 1; - asm + __VOLK_ASM ( "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -488,7 +488,7 @@ const unsigned int num_bytes = num_points*8; unsigned int isodd = num_points & 1; - asm + __VOLK_ASM ( "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -622,7 +622,7 @@ const unsigned int num_bytes = num_points*8; unsigned int isodd = num_points & 1; - asm volatile + __VOLK_ASM __VOLK_VOLATILE ( " #pushl %%ebp\n\t" " #movl %%esp, %%ebp\n\t" @@ -894,8 +894,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -949,8 +949,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); @@ -998,8 +998,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = 
vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); @@ -1050,8 +1050,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); diff -Nru volk-1.3/kernels/volk/volk_32fc_x2_multiply_32fc.h volk-1.4/kernels/volk/volk_32fc_x2_multiply_32fc.h --- volk-1.3/kernels/volk/volk_32fc_x2_multiply_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_x2_multiply_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -372,8 +372,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+4); - __builtin_prefetch(b_ptr+4); + __VOLK_PREFETCH(a_ptr+4); + __VOLK_PREFETCH(b_ptr+4); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -420,8 +420,8 @@ for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+4); - __builtin_prefetch(b_ptr+4); + __VOLK_PREFETCH(a_ptr+4); + __VOLK_PREFETCH(b_ptr+4); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); @@ -449,7 +449,7 @@ #ifdef LV_HAVE_NEON 
extern void -volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, +volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); #endif /* LV_HAVE_NEON */ diff -Nru volk-1.3/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h volk-1.4/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h --- volk-1.3/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h 2018-03-26 22:52:55.000000000 +0000 @@ -262,8 +262,8 @@ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i b_val.val[1] = vnegq_f32(b_val.val[1]); - __builtin_prefetch(a_ptr+4); - __builtin_prefetch(b_ptr+4); + __VOLK_PREFETCH(a_ptr+4); + __VOLK_PREFETCH(b_ptr+4); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r diff -Nru volk-1.3/kernels/volk/volk_32f_index_max_32u.h volk-1.4/kernels/volk/volk_32f_index_max_32u.h --- volk-1.3/kernels/volk/volk_32f_index_max_32u.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_index_max_32u.h 2018-03-26 22:52:55.000000000 +0000 @@ -130,6 +130,69 @@ #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +{ + if(num_points > 0) + { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + 
for(;number < quarterPoints; number++) + { + currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++) + { + if(maxValuesBuffer[number] > max) + { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++) + { + if(src0[number] > max) + { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + + #ifdef LV_HAVE_SSE #include @@ -193,6 +256,259 @@ #endif /*LV_HAVE_SSE*/ +#ifdef LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) +{ + if(num_points > 0) + { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++) + { + currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, 
currentIndexes)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++) + { + if(maxValuesBuffer[number] > max) + { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++) + { + if(src0[number] > max) + { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_SSE*/ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) +{ + if(num_points > 0) + { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + + float* inputPtr = (float*)src0; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + + float max = src0[0]; + float index = 0; + __m256 maxValues = _mm256_set1_ps(max); + __m256 maxValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; + + __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + + for(;number < quarterPoints; number++) + { + currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); + maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 8 points + _mm256_store_ps(maxValuesBuffer, maxValues); + _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 8; number++) + { + if(maxValuesBuffer[number] 
> max) + { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 8; + for(;number < num_points; number++) + { + if(src0[number] > max) + { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_AVX*/ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) +{ + if(num_points > 0) + { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + + float* inputPtr = (float*)src0; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + + float max = src0[0]; + float index = 0; + __m256 maxValues = _mm256_set1_ps(max); + __m256 maxValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; + + __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + + for(;number < quarterPoints; number++) + { + currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); + maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 8 points + _mm256_store_ps(maxValuesBuffer, maxValues); + _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 8; number++) + { + if(maxValuesBuffer[number] > max) + { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 8; + for(;number < num_points; number++) + { + if(src0[number] > max) + { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_AVX*/ + + +#ifdef LV_HAVE_NEON +#include + +static inline void 
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) +{ + if(num_points > 0) + { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + float32x4_t indexIncrementValues = vdupq_n_f32(4); + __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; + float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); + + float max = src0[0]; + float index = 0; + float32x4_t maxValues = vdupq_n_f32(max); + uint32x4_t maxValuesIndex = vmovq_n_u32(0); + uint32x4_t compareResults; + uint32x4_t currentIndexes_u; + float32x4_t currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++) + { + currentValues = vld1q_f32(inputPtr); inputPtr += 4; + currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); + currentIndexes_u = vcvtq_u32_f32(currentIndexes); + compareResults = vcgtq_f32( maxValues, currentValues); + maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); + maxValues = vmaxq_f32(currentValues, maxValues); + } + + // Calculate the largest value from the remaining 4 points + vst1q_f32(maxValuesBuffer, maxValues); + vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); + for(number = 0; number < 4; number++) + { + if(maxValuesBuffer[number] > max) + { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++) + { + if(src0[number] > max) + { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_NEON*/ + + #ifdef LV_HAVE_GENERIC static inline void diff -Nru volk-1.3/kernels/volk/volk_32f_log2_32f.h volk-1.4/kernels/volk/volk_32f_log2_32f.h --- volk-1.3/kernels/volk/volk_32f_log2_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ 
volk-1.4/kernels/volk/volk_32f_log2_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -62,7 +62,7 @@ * \li num_points: The number of data points. * * \b Outputs - * \li cVector: The output vector. + * \li bVector: The output vector. * * \b Example * \code @@ -94,15 +94,6 @@ #include #include -#define POLY0(x, c0) _mm_set1_ps(c0) -#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) -#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) - -#define LOG_POLY_DEGREE 6 - #ifdef LV_HAVE_GENERIC static inline void @@ -119,10 +110,83 @@ } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_AVX2 +#include + +#define AVX_POLY0(x, c0) _mm256_set1_ps(c0) +#define AVX_POLY1(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(AVX_POLY0(x, c1), x), _mm256_set1_ps(c0)) +#define AVX_POLY2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(AVX_POLY1(x, c1, c2), x), _mm256_set1_ps(c0)) +#define AVX_POLY3(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(AVX_POLY2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +#define AVX_POLY4(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(AVX_POLY3(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +#define AVX_POLY5(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(AVX_POLY4(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) + +#define AVX_LOG_POLY_DEGREE 6 + +static inline void +volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) +{ + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 8; + + __m256 aVal, bVal, mantissa, frac, leadingOne; + __m256i bias, exp; + + for(;number < quarterPoints; 
number++){ + + aVal = _mm256_load_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); + bVal = _mm256_cvtepi32_ps(exp); + + // Now to extract mantissa + frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + +#if AVX_LOG_POLY_DEGREE == 6 + mantissa = AVX_POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +#elif AVX_LOG_POLY_DEGREE == 5 + mantissa = AVX_POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +#elif AVX_LOG_POLY_DEGREE == 4 + mantissa = AVX_POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +#elif AVX_LOG_POLY_DEGREE == 3 + mantissa = AVX_POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +#else +#error +#endif + + bVal = _mm256_add_ps(bVal, _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne))); + _mm256_store_ps(bPtr, bVal); + + aPtr += 8; + bPtr += 8; + } + + number = quarterPoints * 8; + for(;number < num_points; number++){ + *bPtr++ = log2f(*aPtr++); + } +} + +#endif /* LV_HAVE_AVX2 for aligned */ + + + #ifdef LV_HAVE_SSE4_1 #include +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define LOG_POLY_DEGREE 6 + static inline 
void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { @@ -291,10 +355,81 @@ #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_AVX2 +#include + +#define AVX_POLY0(x, c0) _mm256_set1_ps(c0) +#define AVX_POLY1(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(AVX_POLY0(x, c1), x), _mm256_set1_ps(c0)) +#define AVX_POLY2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(AVX_POLY1(x, c1, c2), x), _mm256_set1_ps(c0)) +#define AVX_POLY3(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(AVX_POLY2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +#define AVX_POLY4(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(AVX_POLY3(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +#define AVX_POLY5(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(AVX_POLY4(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) + +#define AVX_LOG_POLY_DEGREE 6 + +static inline void +volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) +{ + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 8; + + __m256 aVal, bVal, mantissa, frac, leadingOne; + __m256i bias, exp; + + for(;number < quarterPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); + bVal = _mm256_cvtepi32_ps(exp); + + // Now to extract mantissa + frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + +#if AVX_LOG_POLY_DEGREE == 6 + mantissa = AVX_POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +#elif AVX_LOG_POLY_DEGREE == 5 + mantissa = AVX_POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +#elif AVX_LOG_POLY_DEGREE == 4 + mantissa = 
AVX_POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +#elif AVX_LOG_POLY_DEGREE == 3 + mantissa = AVX_POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +#else +#error +#endif + + bVal = _mm256_add_ps(bVal, _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne))); + _mm256_storeu_ps(bPtr, bVal); + + aPtr += 8; + bPtr += 8; + } + + number = quarterPoints * 8; + for(;number < num_points; number++){ + *bPtr++ = log2f(*aPtr++); + } +} + +#endif /* LV_HAVE_AVX2 for aligned */ + #ifdef LV_HAVE_SSE4_1 #include +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define LOG_POLY_DEGREE 6 + static inline void volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { diff -Nru volk-1.3/kernels/volk/volk_32f_s32f_convert_16i.h volk-1.4/kernels/volk/volk_32f_s32f_convert_16i.h --- volk-1.3/kernels/volk/volk_32f_s32f_convert_16i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_s32f_convert_16i.h 2018-03-26 22:52:55.000000000 +0000 @@ -73,6 +73,62 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector, + const float scalar, unsigned int num_points) +{ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + 
float min_val = -32768; + float max_val = 32767; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2; + __m256i intInputVal1, intInputVal2; + __m256 ret1, ret2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + + // Scale and clip + ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_AVX #include @@ -269,6 +325,62 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector, + const float scalar, unsigned int num_points) +{ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2; + __m256i intInputVal1, intInputVal2; + __m256 ret1, ret2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + 
for(;number < sixteenthPoints; number++){ + inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + // Scale and clip + ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_AVX #include diff -Nru volk-1.3/kernels/volk/volk_32f_s32f_convert_8i.h volk-1.4/kernels/volk/volk_32f_s32f_convert_8i.h --- volk-1.3/kernels/volk/volk_32f_s32f_convert_8i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_s32f_convert_8i.h 2018-03-26 22:52:55.000000000 +0000 @@ -74,6 +74,73 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, + const float scalar, unsigned int num_points) +{ + unsigned int number = 0; + + const unsigned int thirtysecondPoints = num_points / 32; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2, inputVal3, inputVal4; + __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = 
_mm256_set1_ps(max_val); + __m256i intInputVal; + + for(;number < thirtysecondPoints; number++){ + inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + + inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(inputVal1); + intInputVal2 = _mm256_cvtps_epi32(inputVal2); + intInputVal3 = _mm256_cvtps_epi32(inputVal3); + intInputVal4 = _mm256_cvtps_epi32(inputVal4); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); + intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); + + intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); + intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); + outputVectorPtr += 32; + } + + number = thirtysecondPoints * 32; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); + } +} + +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE2 #include @@ -223,6 +290,73 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, + const float scalar, unsigned int num_points) +{ + unsigned int number = 
0; + + const unsigned int thirtysecondPoints = num_points / 32; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2, inputVal3, inputVal4; + __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + __m256i intInputVal; + + for(;number < thirtysecondPoints; number++){ + inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(inputVal1); + intInputVal2 = _mm256_cvtps_epi32(inputVal2); + intInputVal3 = _mm256_cvtps_epi32(inputVal3); + intInputVal4 = _mm256_cvtps_epi32(inputVal4); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); + intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); + + intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); + intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); + outputVectorPtr += 32; + } + + number = thirtysecondPoints * 32; + for(; number < num_points; number++){ + r = 
inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); + } +} + +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE2 #include diff -Nru volk-1.3/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h volk-1.4/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h --- volk-1.3/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,45 @@ +#ifndef INCLUDED_VOLK_32F_S32F_MOD_RANGEPUPPET_32F_H +#define INCLUDED_VOLK_32F_S32F_MOD_RANGEPUPPET_32F_H + +#include + +#ifdef LV_HAVE_GENERIC +static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points); +} +#endif + + +#ifdef LV_HAVE_SSE +static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points); +} +#endif +#ifdef LV_HAVE_SSE +static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points); +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points); +} +#endif +#ifdef LV_HAVE_SSE2 +static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points); +} +#endif + +#ifdef LV_HAVE_AVX +static 
inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points); +} +#endif +#ifdef LV_HAVE_AVX +static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){ + volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points); +} +#endif +#endif diff -Nru volk-1.3/kernels/volk/volk_32f_s32f_normalize.h volk-1.4/kernels/volk/volk_32f_s32f_normalize.h --- volk-1.3/kernels/volk/volk_32f_s32f_normalize.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_s32f_normalize.h 2018-03-26 22:52:55.000000000 +0000 @@ -105,6 +105,39 @@ } #endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m256 vecScalar = _mm256_set1_ps(invScalar); + __m256 input1; + + const uint64_t eigthPoints = num_points / 8; + for(;number < eigthPoints; number++){ + + input1 = _mm256_load_ps(inputPtr); + + input1 = _mm256_mul_ps(input1, vecScalar); + + _mm256_store_ps(inputPtr, input1); + + inputPtr += 8; + } + + number = eigthPoints*8; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ @@ -128,6 +161,45 @@ #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ -#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ +#ifndef INCLUDED_volk_32f_s32f_normalize_u_H +#define INCLUDED_volk_32f_s32f_normalize_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void 
volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m256 vecScalar = _mm256_set1_ps(invScalar); + __m256 input1; + + const uint64_t eigthPoints = num_points / 8; + for(;number < eigthPoints; number++){ + + input1 = _mm256_loadu_ps(inputPtr); + + input1 = _mm256_mul_ps(input1, vecScalar); + + _mm256_storeu_ps(inputPtr, input1); + + inputPtr += 8; + } + + number = eigthPoints*8; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_AVX */ + + +#endif /* INCLUDED_volk_32f_s32f_normalize_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h volk-1.4/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h --- volk-1.3/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,431 @@ +/* -*- c++ -*- */ +/* + Copyright (C) 2017 Free Software Foundation, Inc. + + This file is part of libVOLK + + All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License version 2.1, as + published by the Free Software Foundation. This program is + distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +/*!
+ * \page volk_32f_s32f_s32f_mod_range_32f + * + * \b wraps floating point numbers to stay within a defined [min,max] range + * + * Dispatcher Prototype + * \code + * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li inputVector: The input vector + * \li lower_bound: The lower output boundary + * \li upper_bound: The upper output boundary + * \li num_points: The number of data points. + * + * \b Outputs + * \li outputVector: The vector where the results will be stored. + * + * + */ + +#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H +#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ + __m256 lower = _mm256_set1_ps(lower_bound); + __m256 upper = _mm256_set1_ps(upper_bound); + __m256 distance = _mm256_sub_ps(upper,lower); + float dist = upper_bound - lower_bound; + __m256 input, output; + __m256 is_smaller, is_bigger; + __m256 excess, adj; + + const float *inPtr = inputVector; + float *outPtr = outputVector; + size_t eight_points = num_points / 8; + size_t counter; + for(counter = 0; counter < eight_points; counter++) { + input = _mm256_loadu_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm256_cmp_ps(input, lower, 0x11); //0x11: Less than, ordered, non-signalling + is_bigger = _mm256_cmp_ps(input, upper, 0x1e); //0x1e: greater than, ordered, non-signalling + // find out how far we are out-of-bound – positive values! + excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); + excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add?
(int(excess/distance+1)*distance) + excess = _mm256_div_ps(excess, distance); + // round down + excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); + // plus 1 + adj = _mm256_set1_ps(1.0f); + excess = _mm256_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm256_and_ps(adj, is_smaller); + adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); + output = _mm256_add_ps(input, excess); + _mm256_storeu_ps(outPtr, output); + inPtr += 8; + outPtr += 8; + } + + size_t cnt; + for(cnt = eight_points * 8; cnt < num_points; cnt++){ + float val = inputVector[cnt]; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/dist); + outputVector[cnt] = val + (count+1)*dist; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/dist); + outputVector[cnt] = val - (count+1)*dist; + } + else + outputVector[cnt] = val; + } +} +static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ + __m256 lower = _mm256_set1_ps(lower_bound); + __m256 upper = _mm256_set1_ps(upper_bound); + __m256 distance = _mm256_sub_ps(upper,lower); + float dist = upper_bound - lower_bound; + __m256 input, output; + __m256 is_smaller, is_bigger; + __m256 excess, adj; + + const float *inPtr = inputVector; + float *outPtr = outputVector; + size_t eight_points = num_points / 8; + size_t counter; + for(counter = 0; counter < eight_points; counter++) { + input = _mm256_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm256_cmp_ps(input, lower, 0x11); //0x11: Less than, ordered, non-signalling + is_bigger = _mm256_cmp_ps(input, upper, 0x1e); //0x1e: greater than, ordered, non-signalling + // find out how far we are 
out-of-bound – positive values! + excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); + excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? (int(excess/distance+1)*distance) + excess = _mm256_div_ps(excess, distance); + // round down + excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); + // plus 1 + adj = _mm256_set1_ps(1.0f); + excess = _mm256_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm256_and_ps(adj, is_smaller); + adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); + output = _mm256_add_ps(input, excess); + _mm256_store_ps(outPtr, output); + inPtr += 8; + outPtr += 8; + } + + size_t cnt; + for(cnt = eight_points * 8; cnt < num_points; cnt++){ + float val = inputVector[cnt]; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/dist); + outputVector[cnt] = val + (count+1)*dist; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/dist); + outputVector[cnt] = val - (count+1)*dist; + } + else + outputVector[cnt] = val; + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE2 +#include + +static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper,lower); + float dist = upper_bound - lower_bound; + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + + const float *inPtr = inputVector; + float *outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for(counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + 
// calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? (int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down + excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; + } + + size_t cnt; + for(cnt = quarter_points * 4; cnt < num_points; cnt++){ + float val = inputVector[cnt]; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/dist); + outputVector[cnt] = val + (count+1)*dist; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/dist); + outputVector[cnt] = val - (count+1)*dist; + } + else + outputVector[cnt] = val; + } +} +static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper,lower); + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + + const float *inPtr = inputVector; + float *outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for(counter = 0; counter < quarter_points; counter++) { + input = 
_mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? (int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion. + excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; + } + + float dist = upper_bound - lower_bound; + size_t cnt; + for(cnt = quarter_points * 4; cnt < num_points; cnt++){ + float val = inputVector[cnt]; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/dist); + outputVector[cnt] = val + (count+1)*dist; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/dist); + outputVector[cnt] = val - (count+1)*dist; + } + else + outputVector[cnt] = val; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + +static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper,lower); + float dist = upper_bound - lower_bound; + __m128 input, output; + __m128 is_smaller, 
is_bigger; + __m128 excess, adj; + __m128i rounddown; + + const float *inPtr = inputVector; + float *outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for(counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? (int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down – for some reason + rounddown = _mm_cvttps_epi32(excess); + excess = _mm_cvtepi32_ps(rounddown); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; + } + + size_t cnt; + for(cnt = quarter_points * 4; cnt < num_points; cnt++){ + float val = inputVector[cnt]; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/dist); + outputVector[cnt] = val + (count+1)*dist; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/dist); + outputVector[cnt] = val - (count+1)*dist; + } + else + outputVector[cnt] = val; + } +} +static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = 
_mm_sub_ps(upper,lower); + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + __m128i rounddown; + + const float *inPtr = inputVector; + float *outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for(counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? (int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down + rounddown = _mm_cvttps_epi32(excess); + excess = _mm_cvtepi32_ps(rounddown); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; + } + + float dist = upper_bound - lower_bound; + size_t cnt; + for(cnt = quarter_points * 4; cnt < num_points; cnt++){ + float val = inputVector[cnt]; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/dist); + outputVector[cnt] = val + (count+1)*dist; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/dist); + outputVector[cnt] = val - (count+1)*dist; + } + else + outputVector[cnt] = val; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const 
float upper_bound, unsigned int num_points){ + float* outPtr = outputVector; + const float *inPtr; + float distance = upper_bound - lower_bound; + + for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){ + float val = *inPtr; + if(val < lower_bound){ + float excess = lower_bound - val; + signed int count = (int)(excess/distance); + *outPtr = val + (count+1)*distance; + } + else if(val > upper_bound){ + float excess = val - upper_bound; + signed int count = (int)(excess/distance); + *outPtr = val - (count+1)*distance; + } + else + *outPtr = val; + outPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_s32f_stddev_32f.h volk-1.4/kernels/volk/volk_32f_s32f_stddev_32f.h --- volk-1.3/kernels/volk/volk_32f_s32f_stddev_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_s32f_stddev_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -132,6 +132,65 @@ #endif /* LV_HAVE_SSE4_1 */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer, + const float mean, unsigned int num_points) +{ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int thirtySecondPoints = num_points / 32; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for(;number < thirtySecondPoints; number++) { + aVal1 = _mm256_load_ps(aPtr); aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm256_load_ps(aPtr); aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm256_load_ps(aPtr); aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm256_load_ps(aPtr); aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = 
_mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; + returnValue += squareBuffer[4]; returnValue += squareBuffer[5]; + returnValue += squareBuffer[6]; returnValue += squareBuffer[7]; + + number = thirtySecondPoints * 32; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrtf(returnValue); + } + *stddev = returnValue; +} + +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE #include diff -Nru volk-1.3/kernels/volk/volk_32f_sin_32f.h volk-1.4/kernels/volk/volk_32f_sin_32f.h --- volk-1.3/kernels/volk/volk_32f_sin_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_sin_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -146,7 +146,7 @@ number = quarterPoints * 4; for(;number < num_points; number++) { - *bPtr++ = sin(*aPtr++); + *bPtr++ = sinf(*aPtr++); } } @@ -226,7 +226,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = sin(*aPtr++); + *bPtr++ = sinf(*aPtr++); } } @@ -243,7 +243,7 @@ unsigned int number = 0; for(number = 0; number < num_points; number++) { - *bPtr++ = sin(*aPtr++); + *bPtr++ = sinf(*aPtr++); } } diff -Nru volk-1.3/kernels/volk/volk_32f_sqrt_32f.h volk-1.4/kernels/volk/volk_32f_sqrt_32f.h --- volk-1.3/kernels/volk/volk_32f_sqrt_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_sqrt_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -102,6 +102,39 @@ #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) +{ + unsigned int number 
= 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, cVal; + for(;number < eigthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_sqrt_ps(aVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_NEON #include diff -Nru volk-1.3/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h volk-1.4/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h --- volk-1.3/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h 2018-03-26 22:52:55.000000000 +0000 @@ -76,9 +76,170 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, + const float* inputBuffer, + unsigned int num_points) +{ + float stdDev = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int thirtySecondthPoints = num_points / 32; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for(;number < thirtySecondthPoints; number++) { + aVal1 = _mm256_load_ps(aPtr); aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm256_load_ps(aPtr); aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm256_load_ps(aPtr); aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + accumulator = 
_mm256_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm256_load_ps(aPtr); aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + newMean += meanBuffer[4]; + newMean += meanBuffer[5]; + newMean += meanBuffer[6]; + newMean += meanBuffer[7]; + stdDev = squareBuffer[0]; + stdDev += squareBuffer[1]; + stdDev += squareBuffer[2]; + stdDev += squareBuffer[3]; + stdDev += squareBuffer[4]; + stdDev += squareBuffer[5]; + stdDev += squareBuffer[6]; + stdDev += squareBuffer[7]; + + number = thirtySecondthPoints * 32; + for(;number < num_points; number++){ + stdDev += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + stdDev /= num_points; + stdDev -= (newMean * newMean); + stdDev = sqrtf(stdDev); + } + *stddev = stdDev; + *mean = newMean; + +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean, + const float* inputBuffer, + unsigned int num_points) +{ + float stdDev = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int thirtySecondthPoints = num_points / 32; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 
cVal1, cVal2, cVal3, cVal4; + for(;number < thirtySecondthPoints; number++) { + aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + newMean += meanBuffer[4]; + newMean += meanBuffer[5]; + newMean += meanBuffer[6]; + newMean += meanBuffer[7]; + stdDev = squareBuffer[0]; + stdDev += squareBuffer[1]; + stdDev += squareBuffer[2]; + stdDev += squareBuffer[3]; + stdDev += squareBuffer[4]; + stdDev += squareBuffer[5]; + stdDev += squareBuffer[6]; + stdDev += squareBuffer[7]; + + number = thirtySecondthPoints * 32; + for(;number < num_points; number++){ + stdDev += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + stdDev /= num_points; + stdDev -= (newMean * newMean); + stdDev = sqrtf(stdDev); + } + *stddev = stdDev; + *mean = newMean; + +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE4_1 #include - static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* 
stddev, float* mean, const float* inputBuffer, diff -Nru volk-1.3/kernels/volk/volk_32f_tan_32f.h volk-1.4/kernels/volk/volk_32f_tan_32f.h --- volk-1.3/kernels/volk/volk_32f_tan_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_tan_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -152,7 +152,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = tan(*aPtr++); + *bPtr++ = tanf(*aPtr++); } } @@ -237,7 +237,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *bPtr++ = tan(*aPtr++); + *bPtr++ = tanf(*aPtr++); } } @@ -255,7 +255,7 @@ unsigned int number = 0; for(; number < num_points; number++){ - *bPtr++ = tan(*aPtr++); + *bPtr++ = tanf(*aPtr++); } } #endif /* LV_HAVE_GENERIC */ diff -Nru volk-1.3/kernels/volk/volk_32f_tanh_32f.h volk-1.4/kernels/volk/volk_32f_tanh_32f.h --- volk-1.3/kernels/volk/volk_32f_tanh_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_tanh_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -83,7 +83,7 @@ float* cPtr = cVector; const float* aPtr = aVector; for(; number < num_points; number++) { - *cPtr++ = tanh(*aPtr++); + *cPtr++ = tanhf(*aPtr++); } } diff -Nru volk-1.3/kernels/volk/volk_32f_x2_add_32f.h volk-1.4/kernels/volk/volk_32f_x2_add_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_add_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_add_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -76,6 +76,43 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_add_ps(aVal, bVal); + + 
_mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = quarterPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE #include @@ -191,8 +228,8 @@ // Load in to NEON registers aVal = vld1q_f32(aPtr); bVal = vld1q_f32(bPtr); - __builtin_prefetch(aPtr+4); - __builtin_prefetch(bPtr+4); + __VOLK_PREFETCH(aPtr+4); + __VOLK_PREFETCH(bPtr+4); // vector add cVal = vaddq_f32(aVal, bVal); @@ -213,11 +250,11 @@ #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON -extern void volk_32f_x2_add_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON -extern void volk_32f_x2_add_32f_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC diff -Nru volk-1.3/kernels/volk/volk_32f_x2_divide_32f.h volk-1.4/kernels/volk/volk_32f_x2_divide_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_divide_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_divide_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -110,6 +110,97 @@ #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + 
bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_div_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + + float32x4x4_t aVal, bVal, bInv, cVal; + + const unsigned int eighthPoints = num_points / 16; + unsigned int number = 0; + for(; number < eighthPoints; number++){ + aVal = vld4q_f32(aPtr); + aPtr += 16; + bVal = vld4q_f32(bPtr); + bPtr += 16; + + __VOLK_PREFETCH(aPtr+16); + __VOLK_PREFETCH(bPtr+16); + + bInv.val[0] = vrecpeq_f32(bVal.val[0]); + bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); + bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); + cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); + + bInv.val[1] = vrecpeq_f32(bVal.val[1]); + bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); + bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); + cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); + + bInv.val[2] = vrecpeq_f32(bVal.val[2]); + bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); + bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); + cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); + + bInv.val[3] = vrecpeq_f32(bVal.val[3]); + bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); + bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); + cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); + + vst4q_f32(cPtr, cVal); + cPtr += 16; + } + + for(number = eighthPoints * 16; 
number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} + +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC static inline void @@ -145,3 +236,47 @@ #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H +#define INCLUDED_volk_32f_x2_divide_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_div_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_x2_dot_prod_16i.h volk-1.4/kernels/volk/volk_32f_x2_dot_prod_16i.h --- volk-1.3/kernels/volk/volk_32f_x2_dot_prod_16i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_dot_prod_16i.h 2018-03-26 22:52:55.000000000 +0000 @@ -82,6 +82,154 @@ #endif /*LV_HAVE_GENERIC*/ +#ifdef LV_HAVE_AVX + +static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int thirtySecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 
= _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for(;number < thirtySecondPoints; number++){ + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr+8); + a2Val = _mm256_load_ps(aPtr+16); + a3Val = _mm256_load_ps(aPtr+24); + + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr+8); + b2Val = _mm256_load_ps(bPtr+16); + b3Val = _mm256_load_ps(bPtr+24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtySecondPoints*32; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; +} + +#endif /*LV_HAVE_AVX*/ + + +#ifdef LV_HAVE_AVX + +static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int thirtySecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, 
a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for(;number < thirtySecondPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + a2Val = _mm256_loadu_ps(aPtr+16); + a3Val = _mm256_loadu_ps(aPtr+24); + + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr+8); + b2Val = _mm256_loadu_ps(bPtr+16); + b3Val = _mm256_loadu_ps(bPtr+24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtySecondPoints*32; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; +} + +#endif /*LV_HAVE_AVX*/ + + #ifdef LV_HAVE_SSE static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { diff -Nru 
volk-1.3/kernels/volk/volk_32f_x2_dot_prod_32f.h volk-1.4/kernels/volk/volk_32f_x2_dot_prod_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_dot_prod_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_dot_prod_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -774,11 +774,11 @@ #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON -extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON -extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); #endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff -Nru volk-1.3/kernels/volk/volk_32f_x2_max_32f.h volk-1.4/kernels/volk/volk_32f_x2_max_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_max_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_max_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -112,6 +112,44 @@ #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_max_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < 
num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_NEON #include @@ -180,3 +218,49 @@ #endif /* INCLUDED_volk_32f_x2_max_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_max_32f_u_H +#define INCLUDED_volk_32f_x2_max_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_max_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? 
a : b); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_x2_min_32f.h volk-1.4/kernels/volk/volk_32f_x2_min_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_min_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_min_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -112,6 +112,44 @@ #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_min_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? 
a : b); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_NEON #include @@ -183,3 +221,49 @@ #endif /* INCLUDED_volk_32f_x2_min_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_min_32f_u_H +#define INCLUDED_volk_32f_x2_min_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_min_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? 
a : b); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_min_32f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_x2_pow_32f.h volk-1.4/kernels/volk/volk_32f_x2_pow_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_pow_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_pow_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -190,7 +190,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); + *cPtr++ = powf(*aPtr++, *bPtr++); } } @@ -215,7 +215,7 @@ unsigned int number = 0; for(number = 0; number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); + *cPtr++ = powf(*aPtr++, *bPtr++); } } #endif /* LV_HAVE_GENERIC */ @@ -326,7 +326,7 @@ number = quarterPoints * 4; for(;number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); + *cPtr++ = powf(*aPtr++, *bPtr++); } } diff -Nru volk-1.3/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h volk-1.4/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h --- volk-1.3/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h 2018-03-26 22:52:55.000000000 +0000 @@ -79,6 +79,60 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer, + const float* qBuffer, const float scalar, unsigned int num_points) +{ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m256 vScalar = _mm256_set1_ps(scalar); + + const unsigned int eighthPoints = num_points / 8; + + __m256 iValue, qValue, cplxValue1, cplxValue2; + __m256i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < eighthPoints; number++){ + iValue = _mm256_load_ps(iBufferPtr); + qValue = _mm256_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = 
_mm256_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm256_cvtps_epi32(cplxValue1); + intValue2 = _mm256_cvtps_epi32(cplxValue2); + + intValue1 = _mm256_packs_epi32(intValue1, intValue2); + + _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); + complexVectorPtr += 16; + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE2 #include @@ -214,3 +268,66 @@ #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */ + +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer, + const float* qBuffer, const float scalar, unsigned int num_points) +{ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m256 vScalar = _mm256_set1_ps(scalar); + + const unsigned int eighthPoints = num_points / 8; + + __m256 iValue, qValue, cplxValue1, cplxValue2; + __m256i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < eighthPoints; number++){ + iValue = _mm256_loadu_ps(iBufferPtr); + qValue = _mm256_loadu_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in 
the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm256_cvtps_epi32(cplxValue1); + intValue2 = _mm256_cvtps_epi32(cplxValue2); + + intValue1 = _mm256_packs_epi32(intValue1, intValue2); + + _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); + complexVectorPtr += 16; + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_AVX2 */ + + +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32f_x2_subtract_32f.h volk-1.4/kernels/volk/volk_32f_x2_subtract_32f.h --- volk-1.3/kernels/volk/volk_32f_x2_subtract_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32f_x2_subtract_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -111,6 +111,43 @@ #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_sub_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC static inline void @@ -176,3 +213,48 @@ #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */ + + 
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H +#define INCLUDED_volk_32f_x2_subtract_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_sub_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32i_x2_and_32i.h volk-1.4/kernels/volk/volk_32i_x2_and_32i.h --- volk-1.3/kernels/volk/volk_32i_x2_and_32i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32i_x2_and_32i.h 2018-03-26 22:52:55.000000000 +0000 @@ -84,6 +84,43 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, + const int32_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; + + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + + __m256i aVal, bVal, cVal; + for(;number < oneEightPoints; number++){ + + aVal = _mm256_load_si256((__m256i*)aPtr); + bVal = _mm256_load_si256((__m256i*)bPtr); + + cVal = _mm256_and_si256(aVal, bVal); + + _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = oneEightPoints * 8; + for(;number < 
num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE #include @@ -185,3 +222,49 @@ #endif /* INCLUDED_volk_32i_x2_and_32i_a_H */ + + +#ifndef INCLUDED_volk_32i_x2_and_32i_u_H +#define INCLUDED_volk_32i_x2_and_32i_u_H + +#include +#include + +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector, + const int32_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; + + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + + __m256i aVal, bVal, cVal; + for(;number < oneEightPoints; number++){ + + aVal = _mm256_loadu_si256((__m256i*)aPtr); + bVal = _mm256_loadu_si256((__m256i*)bPtr); + + cVal = _mm256_and_si256(aVal, bVal); + + _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = oneEightPoints * 8; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_AVX2 */ + + +#endif /* INCLUDED_volk_32i_x2_and_32i_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32i_x2_or_32i.h volk-1.4/kernels/volk/volk_32i_x2_or_32i.h --- volk-1.3/kernels/volk/volk_32i_x2_or_32i.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32i_x2_or_32i.h 2018-03-26 22:52:55.000000000 +0000 @@ -84,6 +84,43 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, + const int32_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; + + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + + __m256i aVal, bVal, cVal; + for(;number < oneEightPoints; number++){ + + aVal = 
_mm256_load_si256((__m256i*)aPtr); + bVal = _mm256_load_si256((__m256i*)bPtr); + + cVal = _mm256_or_si256(aVal, bVal); + + _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = oneEightPoints * 8; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE #include @@ -185,3 +222,49 @@ #endif /* INCLUDED_volk_32i_x2_or_32i_a_H */ + + +#ifndef INCLUDED_volk_32i_x2_or_32i_u_H +#define INCLUDED_volk_32i_x2_or_32i_u_H + +#include +#include + +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector, + const int32_t* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; + + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + + __m256i aVal, bVal, cVal; + for(;number < oneEightPoints; number++){ + + aVal = _mm256_loadu_si256((__m256i*)aPtr); + bVal = _mm256_loadu_si256((__m256i*)bPtr); + + cVal = _mm256_or_si256(aVal, bVal); + + _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = oneEightPoints * 8; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_AVX2 */ + + +#endif /* INCLUDED_volk_32i_x2_or_32i_u_H */ diff -Nru volk-1.3/kernels/volk/volk_32u_byteswap.h volk-1.4/kernels/volk/volk_32u_byteswap.h --- volk-1.3/kernels/volk/volk_32u_byteswap.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32u_byteswap.h 2018-03-26 22:52:55.000000000 +0000 @@ -162,7 +162,7 @@ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; - /* these magic numbers are used as byte-indeces in 
the LUT. + /* these magic numbers are used as byte-indices in the LUT. they are pre-computed to save time. A simple C program can calculate them; for example for lookup01: uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; diff -Nru volk-1.3/kernels/volk/volk_32u_reverse_32u.h volk-1.4/kernels/volk/volk_32u_reverse_32u.h --- volk-1.3/kernels/volk/volk_32u_reverse_32u.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_32u_reverse_32u.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,367 @@ +/* -*- c++ -*- */ +/* + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of libVOLK + + All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License version 2.1, as + published by the Free Software Foundation. This program is + distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, see . +*/ + +/*! + * \page volk_32u_reverse_32u + * + * \b bit reversal of the input 32 bit word + + * Dispatcher Prototype + * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector, unsigned int num_points); + * \endcode + * + * \b Inputs + * \li inputVector: The input vector + * \li num_points: The number of data points. 
+ * + * \b Outputs + * \li outputVector: The vector where the results will be stored, which is the bit-reversed input + * + * \endcode + */ +#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H +struct dword_split { + int b00: 1; + int b01: 1; + int b02: 1; + int b03: 1; + int b04: 1; + int b05: 1; + int b06: 1; + int b07: 1; + int b08: 1; + int b09: 1; + int b10: 1; + int b11: 1; + int b12: 1; + int b13: 1; + int b14: 1; + int b15: 1; + int b16: 1; + int b17: 1; + int b18: 1; + int b19: 1; + int b20: 1; + int b21: 1; + int b22: 1; + int b23: 1; + int b24: 1; + int b25: 1; + int b26: 1; + int b27: 1; + int b28: 1; + int b29: 1; + int b30: 1; + int b31: 1; +}; +struct char_split { + uint8_t b00: 1; + uint8_t b01: 1; + uint8_t b02: 1; + uint8_t b03: 1; + uint8_t b04: 1; + uint8_t b05: 1; + uint8_t b06: 1; + uint8_t b07: 1; +}; + +//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +static const unsigned char BitReverseTable256[] = { + 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, + 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, + 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, + 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, + 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, + 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, + 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, + 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, + 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, + 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, + 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, + 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, + 
0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, + 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, + 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, + 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, + 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, + 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, + 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, + 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF +}; +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + const struct dword_split *in_ptr = (const struct dword_split*)in; + struct dword_split * out_ptr = (struct dword_split*)out; + unsigned int number = 0; + for(; number < num_points; ++number){ + out_ptr->b00 = in_ptr->b31; + out_ptr->b01 = in_ptr->b30; + out_ptr->b02 = in_ptr->b29; + out_ptr->b03 = in_ptr->b28; + out_ptr->b04 = in_ptr->b27; + out_ptr->b05 = in_ptr->b26; + out_ptr->b06 = in_ptr->b25; + out_ptr->b07 = in_ptr->b24; + out_ptr->b08 = in_ptr->b23; + out_ptr->b09 = in_ptr->b22; + out_ptr->b10 = in_ptr->b21; + out_ptr->b11 = in_ptr->b20; + out_ptr->b12 = in_ptr->b19; + out_ptr->b13 = in_ptr->b18; + out_ptr->b14 = in_ptr->b17; + out_ptr->b15 = in_ptr->b16; + out_ptr->b16 = in_ptr->b15; + out_ptr->b17 = in_ptr->b14; + out_ptr->b18 = in_ptr->b13; + out_ptr->b19 = in_ptr->b12; + out_ptr->b20 = in_ptr->b11; + out_ptr->b21 = in_ptr->b10; + out_ptr->b22 = in_ptr->b09; + out_ptr->b23 = in_ptr->b08; + out_ptr->b24 = in_ptr->b07; + out_ptr->b25 = in_ptr->b06; + out_ptr->b26 = in_ptr->b05; + out_ptr->b27 = in_ptr->b04; + out_ptr->b28 = in_ptr->b03; + out_ptr->b29 = in_ptr->b02; + out_ptr->b30 = in_ptr->b01; + out_ptr->b31 = in_ptr->b00; + ++in_ptr; + ++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef 
LV_HAVE_GENERIC +static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + unsigned int number = 0; + for(; number < num_points; ++number){ + const struct char_split *in8 = (const struct char_split*)in_ptr; + struct char_split *out8 = (struct char_split*)out_ptr; + + out8[3].b00 = in8[0].b07; + out8[3].b01 = in8[0].b06; + out8[3].b02 = in8[0].b05; + out8[3].b03 = in8[0].b04; + out8[3].b04 = in8[0].b03; + out8[3].b05 = in8[0].b02; + out8[3].b06 = in8[0].b01; + out8[3].b07 = in8[0].b00; + + out8[2].b00 = in8[1].b07; + out8[2].b01 = in8[1].b06; + out8[2].b02 = in8[1].b05; + out8[2].b03 = in8[1].b04; + out8[2].b04 = in8[1].b03; + out8[2].b05 = in8[1].b02; + out8[2].b06 = in8[1].b01; + out8[2].b07 = in8[1].b00; + + out8[1].b00 = in8[2].b07; + out8[1].b01 = in8[2].b06; + out8[1].b02 = in8[2].b05; + out8[1].b03 = in8[2].b04; + out8[1].b04 = in8[2].b03; + out8[1].b05 = in8[2].b02; + out8[1].b06 = in8[2].b01; + out8[1].b07 = in8[2].b00; + + out8[0].b00 = in8[3].b07; + out8[0].b01 = in8[3].b06; + out8[0].b02 = in8[3].b05; + out8[0].b03 = in8[3].b04; + out8[0].b04 = in8[3].b03; + out8[0].b05 = in8[3].b02; + out8[0].b06 = in8[3].b01; + out8[0].b07 = in8[3].b00; + ++in_ptr; + ++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ + +//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + unsigned int number = 0; + for(; number < num_points; ++number){ + *out_ptr = + (BitReverseTable256[*in_ptr & 0xff] << 24) | + (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | + (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | + (BitReverseTable256[(*in_ptr >> 24) & 0xff]); + ++in_ptr; + 
++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ + +//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain +//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + const uint8_t *in8; + uint8_t *out8; + unsigned int number = 0; + for(; number < num_points; ++number){ + in8 = (const uint8_t*)in_ptr; + out8 = (uint8_t*)out_ptr; + out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + ++in_ptr; + ++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_GENERIC +// Current gr-pager implementation +static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + const uint8_t *in8; + uint8_t *out8; + unsigned int number = 0; + for(; number < num_points; ++number){ + in8 = (const uint8_t*)in_ptr; + out8 = (uint8_t*)out_ptr; + out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + ++in_ptr; + ++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ + +//After lengthy thought and quite a bit of whiteboarding: +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + unsigned int 
number = 0; + for(; number < num_points; ++number){ + uint32_t tmp = *in_ptr; + /* permute uint16: + The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. + */ + tmp = ( tmp << 16 ) | ( tmp >> 16 ); + /* permute bytes: + shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes + */ + tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); + /* permute 4bit tuples: + Same idea, but the "consideration" mask expression becomes unwieldy + */ + tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); + /* permute 2bit tuples: + Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = + 3; we need those every 4b, which coincides with a hex digit! + */ + tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); + /* permute odd/even: + 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05! + */ + tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); + + *out_ptr = tmp; + ++in_ptr; + ++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + //same stuff as top_down, inverted order (permutation matrices don't care, you know!) 
+ const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + unsigned int number = 0; + for(; number < num_points; ++number){ + uint32_t tmp = *in_ptr; + tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); + tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); + tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); + tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); + tmp = ( tmp << 16 ) | ( tmp >> 16 ); + + *out_ptr = tmp; + ++in_ptr; + ++out_ptr; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_NEON +#include + +#define DO_RBIT \ + asm("rbit %1,%0" : "=r" (*out_ptr) : "r" (*in_ptr)); \ + in_ptr++; \ + out_ptr++; + +static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, + unsigned int num_points) +{ + + const uint32_t *in_ptr = in; + uint32_t *out_ptr = out; + const unsigned int eighthPoints = num_points/8; + unsigned int number = 0; + for(; number < eighthPoints; ++number){ + __VOLK_PREFETCH(in_ptr+8); + DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; + DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; + } + number = eighthPoints*8; + for(; number < num_points; ++number){ + DO_RBIT; + } +} +#undef DO_RBIT +#endif /* LV_HAVE_NEON */ + + +#endif /* INCLUDED_volk_32u_reverse_32u_u_H */ + diff -Nru volk-1.3/kernels/volk/volk_64f_convert_32f.h volk-1.4/kernels/volk/volk_64f_convert_32f.h --- volk-1.3/kernels/volk/volk_64f_convert_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64f_convert_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -67,6 +67,41 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int oneEightPoints = num_points / 8; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret1, ret2; + 
__m256d inputVal1, inputVal2; + + for(;number < oneEightPoints; number++){ + inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; + + ret1 = _mm256_cvtpd_ps(inputVal1); + ret2 = _mm256_cvtpd_ps(inputVal2); + + _mm_storeu_ps(outputVectorPtr, ret1); + outputVectorPtr += 4; + + _mm_storeu_ps(outputVectorPtr, ret2); + outputVectorPtr += 4; + } + + number = oneEightPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE2 #include @@ -124,6 +159,41 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int oneEightPoints = num_points / 8; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret1, ret2; + __m256d inputVal1, inputVal2; + + for(;number < oneEightPoints; number++){ + inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; + + ret1 = _mm256_cvtpd_ps(inputVal1); + ret2 = _mm256_cvtpd_ps(inputVal2); + + _mm_store_ps(outputVectorPtr, ret1); + outputVectorPtr += 4; + + _mm_store_ps(outputVectorPtr, ret2); + outputVectorPtr += 4; + } + + number = oneEightPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE2 #include diff -Nru volk-1.3/kernels/volk/volk_64f_x2_add_64f.h volk-1.4/kernels/volk/volk_64f_x2_add_64f.h --- volk-1.3/kernels/volk/volk_64f_x2_add_64f.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64f_x2_add_64f.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,255 @@ +/* -*- c++ -*- */ +/* + * Copyright 2018 Free Software Foundation, Inc. 
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_64f_x2_add_64f + * + * \b Overview + * + * Adds two input double-precision floating point vectors together. + * + * c[i] = a[i] + b[i] + * + * Dispatcher Prototype + * \code + * void volk_64f_x2_add_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li aVector: First input vector. + * \li bVector: Second input vector. + * \li num_points: The number of values in both input vectors. + * + * \b Outputs + * \li cVector: The output vector. + * + * \b Example + * Add elements of an increasing vector to those of a decreasing vector. 
+ * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment); + * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment); + * double* out = (double*)volk_malloc(sizeof(double)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * increasing[ii] = (float)ii; + * decreasing[ii] = 10.f - (float)ii; + * } + * + * volk_64f_x2_add_64f(out, increasing, decreasing, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %1.2F\n", ii, out[ii]); + * } + * + * volk_free(increasing); + * volk_free(decreasing); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_64f_x2_add_64f_H +#define INCLUDED_volk_64f_x2_add_64f_H + +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} + +#endif /* LV_HAVE_GENERIC */ + +/* + * Unaligned versions + */ + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void +volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int half_points = num_points / 2; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_loadu_pd(aPtr); + bVal = _mm_loadu_pd(bPtr); + + cVal = _mm_add_pd(aVal, bVal); + + _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} + +#endif /* 
LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_AVX + +#include + +static inline void +volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { + + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); + + cVal = _mm256_add_pd(aVal, bVal); + + _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + +/* + * Aligned versions + */ + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void +volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int half_points = num_points / 2; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_add_pd(aVal, bVal); + + _mm_store_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} + +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_AVX + +#include + +static inline void +volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m256d aVal, 
bVal, cVal; + for (; number < quarter_points; number++) { + + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); + + cVal = _mm256_add_pd(aVal, bVal); + + _mm256_store_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_64f_x2_add_64f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_64f_x2_max_64f.h volk-1.4/kernels/volk/volk_64f_x2_max_64f.h --- volk-1.3/kernels/volk/volk_64f_x2_max_64f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64f_x2_max_64f.h 2018-03-26 22:52:55.000000000 +0000 @@ -74,6 +74,45 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, + const double* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m256d aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); + + cVal = _mm256_max_pd(aVal, bVal); + + _mm256_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? 
a : b); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE2 #include @@ -134,3 +173,51 @@ #endif /* INCLUDED_volk_64f_x2_max_64f_a_H */ + + +#ifndef INCLUDED_volk_64f_x2_max_64f_u_H +#define INCLUDED_volk_64f_x2_max_64f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector, + const double* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m256d aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); + + cVal = _mm256_max_pd(aVal, bVal); + + _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? 
a : b); + } +} +#endif /* LV_HAVE_AVX */ + + +#endif /* INCLUDED_volk_64f_x2_max_64f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_64f_x2_min_64f.h volk-1.4/kernels/volk/volk_64f_x2_min_64f.h --- volk-1.3/kernels/volk/volk_64f_x2_min_64f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64f_x2_min_64f.h 2018-03-26 22:52:55.000000000 +0000 @@ -74,6 +74,45 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, + const double* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m256d aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); + + cVal = _mm256_min_pd(aVal, bVal); + + _mm256_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? 
a : b); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE2 #include @@ -134,3 +173,50 @@ #endif /* INCLUDED_volk_64f_x2_min_64f_a_H */ + +#ifndef INCLUDED_volk_64f_x2_min_64f_u_H +#define INCLUDED_volk_64f_x2_min_64f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector, + const double* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m256d aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); + + cVal = _mm256_min_pd(aVal, bVal); + + _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_AVX */ + + +#endif /* INCLUDED_volk_64f_x2_min_64f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_64f_x2_multiply_64f.h volk-1.4/kernels/volk/volk_64f_x2_multiply_64f.h --- volk-1.3/kernels/volk/volk_64f_x2_multiply_64f.h 1970-01-01 00:00:00.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64f_x2_multiply_64f.h 2018-03-26 22:52:55.000000000 +0000 @@ -0,0 +1,255 @@ +/* -*- c++ -*- */ +/* + * Copyright 2018 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_64f_x2_multiply_64f + * + * \b Overview + * + * Multiplies two input double-precision floating point vectors together. + * + * c[i] = a[i] * b[i] + * + * Dispatcher Prototype + * \code + * void volk_64f_x2_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li aVector: First input vector. + * \li bVector: Second input vector. + * \li num_points: The number of values in both input vectors. + * + * \b Outputs + * \li cVector: The output vector. + * + * \b Example + * Multiply elements of an increasing vector by those of a decreasing vector. + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment); + * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment); + * double* out = (double*)volk_malloc(sizeof(double)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * increasing[ii] = (float)ii; + * decreasing[ii] = 10.f - (float)ii; + * } + * + * volk_64f_x2_multiply_64f(out, increasing, decreasing, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %1.2F\n", ii, out[ii]); + * } + * + * volk_free(increasing); + * volk_free(decreasing); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_64f_x2_multiply_64f_H +#define INCLUDED_volk_64f_x2_multiply_64f_H + +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + 
unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} + +#endif /* LV_HAVE_GENERIC */ + +/* + * Unaligned versions + */ + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void +volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int half_points = num_points / 2; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_loadu_pd(aPtr); + bVal = _mm_loadu_pd(bPtr); + + cVal = _mm_mul_pd(aVal, bVal); + + _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} + +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_AVX + +#include + +static inline void +volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { + + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); + + cVal = _mm256_mul_pd(aVal, bVal); + + _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + +/* + * Aligned versions + */ + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void +volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + 
unsigned int number = 0; + const unsigned int half_points = num_points / 2; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_mul_pd(aVal, bVal); + + _mm_store_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} + +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_AVX + +#include + +static inline void +volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector, + const double *bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; + + double *cPtr = cVector; + const double *aPtr = aVector; + const double *bPtr = bVector; + + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { + + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); + + cVal = _mm256_mul_pd(aVal, bVal); + + _mm256_store_pd(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_64f_x2_multiply_64f_u_H */ diff -Nru volk-1.3/kernels/volk/volk_64u_byteswap.h volk-1.4/kernels/volk/volk_64u_byteswap.h --- volk-1.3/kernels/volk/volk_64u_byteswap.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64u_byteswap.h 2018-03-26 22:52:55.000000000 +0000 @@ -250,7 +250,7 @@ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; - /* these magic numbers are used as byte-indeces in the LUT. + /* these magic numbers are used as byte-indices in the LUT. 
they are pre-computed to save time. A simple C program can calculate them; for example for lookup01: uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; diff -Nru volk-1.3/kernels/volk/volk_64u_popcnt.h volk-1.4/kernels/volk/volk_64u_popcnt.h --- volk-1.3/kernels/volk/volk_64u_popcnt.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_64u_popcnt.h 2018-03-26 22:52:55.000000000 +0000 @@ -84,7 +84,7 @@ uint64_t retVal64 = retVal; //retVal = valueVector[1]; - retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 31); + retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; diff -Nru volk-1.3/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h volk-1.4/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h --- volk-1.3/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h 2018-03-26 22:52:55.000000000 +0000 @@ -187,6 +187,77 @@ #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, + const float scalar, unsigned int num_points) +{ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + __m256 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 14, 12, 10, 8, 6, 4, 2, 0, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 14, 12, 10, 8, 6, 4, 2, 0); + __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, + 15, 13, 11, 9, 7, 5, 3, 1, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < sixteenthPoints; number++){ + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); + qFloatValue = _mm256_cvtepi32_ps(qIntVal); + qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); + _mm256_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 8; + + qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); + qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); + qFloatValue = _mm256_cvtepi32_ps(qIntVal); + qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); + _mm256_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 8; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_GENERIC static inline void @@ -207,6 +278,4 @@ #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */ diff -Nru volk-1.3/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h volk-1.4/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h --- 
volk-1.3/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -59,6 +59,57 @@ #include #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector, + const float scalar, unsigned int num_points) +{ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + __m256 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 14, 12, 10, 8, 6, 4, 2, 0, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 14, 12, 10, 8, 6, 4, 2, 0); + for(;number < sixteenthPoints; number++){ + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE4_1 #include diff -Nru volk-1.3/kernels/volk/volk_8i_s32f_convert_32f.h volk-1.4/kernels/volk/volk_8i_s32f_convert_32f.h --- 
volk-1.3/kernels/volk/volk_8i_s32f_convert_32f.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/kernels/volk/volk_8i_s32f_convert_32f.h 2018-03-26 22:52:55.000000000 +0000 @@ -115,7 +115,6 @@ } #endif /* LV_HAVE_SSE4_1 */ - #ifdef LV_HAVE_GENERIC static inline void @@ -200,6 +199,62 @@ } #endif /* LV_HAVE_SSE4_1 */ +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector, + const float scalar, unsigned int num_points) +{ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + + const float iScalar = 1.0 / scalar; + const float32x4_t qiScalar = vdupq_n_f32(iScalar); + + int8x8x2_t inputVal; + float32x4x2_t outputFloat; + int16x8_t tmp; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + for(;number < sixteenthPoints; number++){ + __VOLK_PREFETCH(inputVectorPtr+16); + + inputVal = vld2_s8(inputVectorPtr); + inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); + inputVectorPtr += 16; + + tmp = vmovl_s8(inputVal.val[0]); + + outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); + outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[0]); + outputVectorPtr += 4; + + outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); + outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[1]); + outputVectorPtr += 4; + + tmp = vmovl_s8(inputVal.val[1]); + + outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); + outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[0]); + outputVectorPtr += 4; + + outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); + outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[1]); + outputVectorPtr += 4; + } + for(number = sixteenthPoints * 16; number < 
num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} + +#endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC diff -Nru volk-1.3/lib/CMakeLists.txt volk-1.4/lib/CMakeLists.txt --- volk-1.3/lib/CMakeLists.txt 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/lib/CMakeLists.txt 2018-03-26 22:52:55.000000000 +0000 @@ -201,7 +201,7 @@ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c) else(CMAKE_SIZEOF_VOID_P EQUAL 4) - # 64-bit compilations won't need this command so don't overrule AVX + # 64-bit compilations won't need this command so don't overrule AVX set(HAVE_AVX_CVTPI32_PS 0) endif(CMAKE_SIZEOF_VOID_P EQUAL 4) @@ -399,16 +399,6 @@ string(REPLACE "\n" " \\n" cmake_c_compiler_version ${cmake_c_compiler_version}) string(REPLACE "\n" " \\n" COMPILER_INFO ${COMPILER_INFO}) -######################################################################## -# Set local include directories first -######################################################################## -include_directories( - ${PROJECT_BINARY_DIR}/include - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/kernels - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} -) ######################################################################## # Handle ASM support @@ -489,7 +479,7 @@ message(STATUS "Loading version ${VERSION} into constants...") #double escape for windows backslash path separators -string(REPLACE "\\" "\\\\" prefix ${prefix}) +string(REPLACE "\\" "\\\\" prefix "${prefix}") configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/constants.c.in @@ -528,13 +518,28 @@ endif() #Use object library for faster overall build in newer versions of cmake -if(CMAKE_VERSION VERSION_GREATER "2.8.7") +if(CMAKE_VERSION VERSION_GREATER "2.8.11") #Create a volk object library (requires cmake >= 2.8.8) add_library(volk_obj OBJECT ${volk_sources}) + # a better cmake-fu user may make this more repeatable + 
target_include_directories(volk_obj + PUBLIC ${PROJECT_BINARY_DIR}/include + PUBLIC ${PROJECT_SOURCE_DIR}/include + PRIVATE ${PROJECT_SOURCE_DIR}/kernels + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ) #Add dynamic library add_library(volk SHARED $) target_link_libraries(volk ${volk_libraries}) + target_include_directories(volk + PUBLIC ${PROJECT_BINARY_DIR}/include + PUBLIC ${PROJECT_SOURCE_DIR}/include + PRIVATE ${PROJECT_SOURCE_DIR}/kernels + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ) #Configure target properties set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fPIC") @@ -551,6 +556,13 @@ #Configure static library if(ENABLE_STATIC_LIBS) add_library(volk_static STATIC $) + target_include_directories(volk_static + PUBLIC ${PROJECT_BINARY_DIR}/include + PUBLIC ${PROJECT_SOURCE_DIR}/include + PRIVATE ${PROJECT_SOURCE_DIR}/kernels + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ) set_target_properties(volk_static PROPERTIES OUTPUT_NAME volk) @@ -558,12 +570,18 @@ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" ) endif(ENABLE_STATIC_LIBS) - #Older cmake versions (slower to build when building dynamic/static libs) else() #create the volk runtime library add_library(volk SHARED ${volk_sources}) target_link_libraries(volk ${volk_libraries}) + include_directories(volk + PUBLIC ${PROJECT_BINARY_DIR}/include + PUBLIC ${PROJECT_SOURCE_DIR}/include + PRIVATE ${PROJECT_SOURCE_DIR}/kernels + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ) set_target_properties(volk PROPERTIES SOVERSION ${LIBVER}) set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS") @@ -586,7 +604,7 @@ ) endif(ENABLE_STATIC_LIBS) -endif(CMAKE_VERSION VERSION_GREATER "2.8.7") +endif(CMAKE_VERSION VERSION_GREATER "2.8.11") ######################################################################## # Build the QA test application 
######################################################################## @@ -601,10 +619,15 @@ ) include(VolkAddTest) - VOLK_ADD_TEST(test_all + VOLK_GEN_TEST("volk_test_all" SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc - ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc TARGET_DEPS volk - ) + ) + foreach(kernel ${h_files}) + get_filename_component(kernel ${kernel} NAME) + string(REPLACE ".h" "" kernel ${kernel}) + VOLK_ADD_TEST(${kernel} "volk_test_all") + endforeach() endif(ENABLE_TESTING) diff -Nru volk-1.3/lib/constants.c.in volk-1.4/lib/constants.c.in --- volk-1.3/lib/constants.c.in 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/lib/constants.c.in 2018-03-26 22:52:55.000000000 +0000 @@ -24,33 +24,36 @@ #include #endif +#include #include -char* +const char* volk_prefix() { + const char *prefix = getenv("VOLK_PREFIX"); + if (prefix != NULL) return prefix; return "@prefix@"; } -char* +const char* volk_version() { return "@VERSION@"; } -char* +const char* volk_c_compiler() { return "@cmake_c_compiler_version@"; } -char* +const char* volk_compiler_flags() { return "@COMPILER_INFO@"; } -char* +const char* volk_available_machines() { return "@available_machines@"; diff -Nru volk-1.3/lib/kernel_tests.h volk-1.4/lib/kernel_tests.h --- volk-1.3/lib/kernel_tests.h 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/lib/kernel_tests.h 2018-03-26 22:52:55.000000000 +0000 @@ -1,8 +1,6 @@ #include "qa_utils.h" #include - -#include #include // macros for initializing volk_test_case_t. 
Maccros are needed to generate @@ -18,129 +16,139 @@ volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ test_params) +#define QA(test) test_cases.push_back(test); std::vector init_test_list(volk_test_params_t test_params) { // Some kernels need a lower tolerance volk_test_params_t test_params_inacc = volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + volk_test_params_t test_params_inacc_tenth = volk_test_params_t(1e-1, test_params.scalar(), + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); volk_test_params_t test_params_int1 = volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); - std::vector test_cases = boost::assign::list_of - (VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) - - (VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params)) - (VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) - (VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) - (VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) - (VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params)) - (VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) - (VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) - (VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) - (VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) - (VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) - 
(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) - (VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params_int1)) - (VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) - (VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) - (VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) - (VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) - (VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) - (VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) - (VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) - (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) - (VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) - (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, 
test_params_inacc)) - (VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) - (VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) - (VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) - (VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) - (VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) - (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params_int1)) - (VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) - (VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_convert_32i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_convert_8i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) - (VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) - 
(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) - (VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) - (VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) - (VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) - (VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params)) - (VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) - (VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) - (VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) - (VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) - (VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) - (VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) - (VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) - (VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) - (VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) - (VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) - (VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) - (VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) - (VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) - (VOLK_INIT_TEST(volk_8i_convert_16i, 
test_params)) - (VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) - (VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) - (VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) - (VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) - (VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) - (VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) - (VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) - (VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) - // no one uses these, so don't test them - //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - // we need a puppet for this one - //(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params)) - - ; + std::vector test_cases; + QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) + QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) + QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) + QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params)) + QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) + QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) + QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) + 
QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params)) + QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params_int1)) + QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) + QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) + QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + 
QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params_int1)) + QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) + QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, 
test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()))) + QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) + 
QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) + QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) + QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) + QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) + QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) + QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) + // no one uses these, so don't test them + 
//VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + // we need a puppet for this one + //(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params)) return test_cases; diff -Nru volk-1.3/lib/qa_utils.cc volk-1.4/lib/qa_utils.cc --- volk-1.3/lib/qa_utils.cc 2016-07-02 15:57:23.000000000 +0000 +++ volk-1.4/lib/qa_utils.cc 2018-03-26 22:52:55.000000000 +0000 @@ -1,25 +1,21 @@ +#include #include "qa_utils.h" -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include // for volk_func_desc_t +#include // for volk_free, volk_m... -#include -#include -#include -#include +#include // for assert +#include // for uint16_t, uint64_t +#include // for CLOCKS_PER_SEC +#include // for int16_t, int32_t +#include // for sqrt, fabs, abs +#include