SIMD

SIMD:single instruction multiply data
单指令多数据,多用于矢量运算中,可以加速指令运算,比如矩阵乘

SIMD的思想在不同平台架构下进行了实现

  • X86:SSE指令
  • ARM:NEON指令
  • MIPS:MSA

不同平台架构下为实现SIMD思想也添加了不同的硬件支持(专用寄存器):

  • X86:目前支持128bit寄存器,256bit,甚至是512bit
  • ARM:128bit
  • MIPS:128bit

为何SIMD可以更加快速的进行矩阵运算
一般流程:每次从内存加载一个数据寄存器进行运算
SIMD:一次加载多个数据到专用寄存器,则一条指令就能进行多个数据的加乘运算

SSE指令下:矩阵运算主要流程如下:
SIMD.png

简单来说就是:加载 -> 运算 -> 存储

  1. 加载:将数据从内存加载到专用寄存器,一次可以加载多个数据
  2. 运算:使用SSE乘/加命令讲点积和累加
  3. 存储:将结果保存到内存中
    具体命令可参考Intel SSE指令网址

MSA与NEON指令类似。
在矩阵运算中,SIMD优化视具体情况而定,除了SIMD,还可进行循环展开,缓存控制(将数据拆分至缓存可以存储大小,方便数据读取),通用寄存器使用(频繁使用的数据直接放至寄存器,C++使用register关键字)

具体代码参考:
Matrix.h

#include <iostream>
#include <cstdlib>
#include <stdio.h>
#include <assert.h>
using namespace std;



template <typename T>   //T is char, short or int;
class Matrix{

public:
    T **_Matrix;
    size_t _Row, _Column;
    Matrix() :_Matrix(nullptr), _Row(0), _Column(0) {}
    Matrix(size_t r, size_t c) :_Row(r), _Column(c) {
        if (!_Column || !_Row) return;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **p = _Matrix, **end = _Matrix + _Row;
        do {
            *(p++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
        } while (p != end);
    }
    Matrix(size_t r, size_t c, const T init) :_Row(r), _Column(c) {//use init to initialize the matrix
        if (!_Column || !_Row) return;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **pr = _Matrix, **endr = _Matrix + _Row, *p, *end;
        do {
            p = *(pr++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
            end = p + _Column;
            do {
                *(p++) = init;
            } while (p != end);
        } while (pr != endr);
    }
    Matrix(const Matrix& B) {//拷贝构造
        _Row = B._Row;
        _Column = B._Column;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **pbr = B._Matrix, **endbr = B._Matrix + _Row, *pb, *endb,
            **par = _Matrix, **endar = _Matrix + _Row, *pa, *enda;
        do {
            pa = *(par++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
            enda = pa + _Column;
            pb = *(pbr++);
            endb = pb + _Column;
            do {
                *(pa++) = *(pb++);
            } while (pa != enda);
        } while (par != endar);
    }
    ~Matrix() {//析构
        if (!_Matrix || !_Column || !_Row) return;
        T **p = _Matrix, **end = _Matrix + _Row;
        do {
            _mm_free(*(p++));
        } while (p != end);
        _Column = _Row = 0;
        _mm_free(_Matrix);
    }
    T& operator()(size_t i, size_t j) { return _Matrix[i][j]; }//get the  elements of row i and column j

    const T operator()(size_t i, size_t j)const { return _Matrix[i][j]; }//get the  elements of row i and column j

    Matrix& operator=(Matrix& B) {//
        if (_Matrix && _Row && _Column) {
            T **p = _Matrix, **end = _Matrix + _Row;
            do {
                _mm_free(*(p++));
            } while (p != end);
            _mm_free(_Matrix);
        }
        _Row = B._Row;
        _Column = B._Column;
        _Matrix = B._Matrix;
        B._Matrix = nullptr;
        return *this;
    }

    //transpose the matrix
    Matrix Tranpose() const{
        Matrix temp(_Column, _Row);
        for (int i = 0; i < _Row; i++) {
            for (int j = 0; j < _Column; j++) {
                temp(j, i) = (*this)(i, j);
            }
        }
        return temp;
    }

    //alignas
    Matrix _alignas(int align_row, int align_col) {
        int rest_row = align_row - _Row % align_row, rest_col = align_col - _Column % align_col;
        if (rest_row || rest_col) {
            int row = _Row + rest_row, col = _Column + rest_col;
            Matrix temp(row, col, 0);
            T **pbr = _Matrix, **endbr = _Matrix + _Row, *pb, *endb,
                **par = temp._Matrix, *pa;
            do {
                pa = *(par++);
                pb = *(pbr++);
                endb = pb + _Column;
                do {
                    *(pa++) = *(pb++);
                } while (pb != endb);
            } while (pbr != endbr);
            return temp;
        }
        return *this;
    }
    void print() {
        cout << "the matrix is:\n" << endl;
        int column_min = 10 < _Column ? 10 : _Column;
        int row_min = 10 <_Row ? 10 : _Row;
        for (short i = 0; i < row_min; i++) {
            for (short j = 0; j < column_min; j++) {
                cout << (short)_Matrix[i][j] << "  ";
            }
            cout << endl;
        }
    }
};

Vector.h

#pragma once
#include <iostream>
#include <cstdlib>
#include <stdio.h>
#include <assert.h>
using namespace std;



template <typename T>   //T is char, short or int;
class Vector {

public:
    T *_Vector;
    size_t _Column;
    Vector() :_Vector(NULL),  _Column(0) {}
    Vector(size_t c) : _Column(c) {
        if (!_Column) return;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T), 16);

    }
    Vector(size_t c, const T init) : _Column(c) {//use init to initialize the Vector
        if (!_Column) return;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T), 16);
        T *pr = _Vector, *endr = _Vector + _Column, *p, *end;
        do {
            *(pr++) = init; 
        } while (pr != endr);
    }
    Vector(const Vector& B) {//拷贝构造
        _Column = B._Column;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T*), 16);
        T *pbr = B._Vector, *endbr = B._Vector + _Column, *par = _Vector;
        do {
            *(par++) = *(pbr++);
        } while (pbr != endbr);
    }
    ~Vector() {//析构
        if (!_Vector) return;
        T *p = _Vector;
        _mm_free(_Vector);
    }
    T& operator()(size_t i) { return _Vector[i]; }//get the  elements of row i and column j

    const T operator()(size_t i)const { return _Vector[i]; }//get the  elements of row i and column j

    
    Vector& operator=(Vector& B) {//
        if (_Column && _Vector != NULL) {
            _mm_free(_Vector);
        }
        _Column = B._Column;
        _Vector = B._Vector;
        B._Vector = nullptr;
        return *this;
    }
    

    //alignas(16)
    Vector _alignas(int align_size) {
        int rest = align_size - _Column % align_size;
        if (rest) {
            int col = _Column + rest;
            Vector temp(col, 0);
            T *p = _Vector, *end = _Vector + _Column, *t = temp._Vector;
            do {
                *(t++) = *(p++);
            } while (p != end);
            return temp;
        }
        return *this;
    }
    //print
    void print() {
        cout << "the Vector Col is:  " <<_Column << "\nThe Vector is:\n" << endl;
        int column_min = 10 < _Column ? 10 : _Column;
        for (int i = 0; i < column_min; i++) {
            cout << (short)_Vector[i] << "  ";
        }
        cout << endl;
    }

};


Operator.h

#pragma once
#include <iostream>
#include <cstdlib>
#include <assert.h>
#include "Matrix.h"
#include "Vector.h"
#include <emmintrin.h>                 
#include <immintrin.h> 
using namespace std;

///Vec*Matrix
template <class T>
Vector<T> Vec_Mul_Mat_C(const Vector<T> &V, const Matrix<T> &A) {
    if (V._Column != A._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return NULL;
    }
    Vector<T> ret(A._Column, 0);
    int sum = 0;
    for (int i = 0; i < A._Column; ++i) {
        for (int j = 0; j < V._Column; ++j) {
            ret(i) += V(j) * A(j, i);
        }
    }
    return ret;
}


template <class T>
Vector<T> Vec_Mul_Mat_SSE(const Vector<T> &V, const Matrix<T> &A) {
    if (V._Column != A._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return NULL;
    }

    Vector<T> ret(A._Column, 0);
    Matrix<T> C = A.Tranpose();
    
    if (sizeof(T) == 1) {
        __m128i X, Y;
        __m128i acc;
        alignas(16) short temp[8];
        int i(0), j(0);
        for (i = 0; i < C._Row; ++i) {
            acc = _mm_setzero_si128();
            for (j = 0; j + 16 <= V._Column; j += 16) {
                X = _mm_load_si128((__m128i*)V._Vector);
                Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));

                acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
            }
            _mm_store_si128((__m128i*)&temp[0], acc);
            ret(i) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
        }
    }
    else if (sizeof(T) == 2) {
        Vector<T> ret(A._Column, 0);
        __m128i X, Y;
        __m128i acc;
        alignas(16) int temp[4];
        int i(0), j(0);
        for (i = 0; i < C._Row; ++i) {
            acc = _mm_setzero_si128();
            for (j = 0; j + 8 <= V._Column; j += 8) {
                X = _mm_load_si128((__m128i*)V._Vector);
                Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));

                acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
            }
            _mm_store_si128((__m128i*)&temp[0], acc);
            ret(i) = temp[0] + temp[1] + temp[2] + temp[3];
        }
    }
    return ret;
}



//matrix *matrix C
template <class T>
Matrix<T> Matrix_Mul_C(const Matrix<T> &A, const Matrix<T> &B) {

    Matrix<T> retC(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retC;
    }
    
    for (short i = 0; i < A._Row; i++) {
        for (short j = 0; j < B._Column; j++) {
            for (short k = 0; k < A._Column; k++) {
                retC(i, j) += A(i, k)*B(k, j);
            }
        }
    }
    return retC;
}


//unroll-loop for 1*8
template <class T>
Matrix<T> Matrix_Mul_C_(const Matrix<T> &A, const Matrix<T> &B) {
    
    Matrix<T> retC(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retC;
    }

    
    int i, j, k;
    for (i = 0; i < A._Row; ++i) {
        for (j = 0; j < B._Column; j++) {
            for (k = 0; k + 8 <= A._Column; k += 8) {
                retC(i, j) += A(i, k)*B(k, j) + A(i, k + 1)*B(k + 1, j) + A(i, k + 2)*B(k + 2, j) + A(i, k + 3)*B(k + 3, j)
                    + A(i, k + 4)*B(k + 4, j) + A(i, k + 5)*B(k + 5, j) + A(i, k + 6)*B(k + 6, j) + A(i, k + 7)*B(k + 7, j);
            }
        }
    }
    return retC;
}


//unroll-loop for 8*8
template <class T>
Matrix<T> Matrix_Mul_CC_(const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> retC(A._Row, B._Column, 0);
    int i, j, k;
    for (i = 0; i + 8 <= A._Row; i += 8) {
        for (j = 0; j < B._Column; j++) {
            for (k = 0; k + 8 <= A._Column; k += 8) {
                for (int t = 0; t < 8; t++) {
                    retC(i + t, j) += A(i + t, k)*B(k, j) + A(i + t, k + 1)*B(k + 1, j) + A(i + t, k + 2)*B(k + 2, j) + A(i + t, k + 3)*B(k + 3, j)
                        + A(i + t, k + 4)*B(k + 4, j) + A(i + t, k + 5)*B(k + 5, j) + A(i + t, k + 6)*B(k + 6, j) + A(i + t, k + 7)*B(k + 7, j);
                }
            }
        }
    }
    return retC;
}



template<class T>
Matrix<T> Matrix_Mul_SSE(const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> retSSE(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retSSE;
    }
    Matrix<T> C = B.Tranpose();
    __m128i X, Y;
    __m128i acc = _mm_setzero_si128();

    int i, j, k;
    if(sizeof(T) == 1) {
        alignas(16) short temp[8];
        for (i = 0; i < A._Row; ++i) {
            for (j = 0; j < C._Row; ++j) {
                acc = _mm_setzero_si128();
                for (k = 0; k + 16 <= A._Column; k += 16) {
                    X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
                    Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
                    acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
                }
                _mm_store_si128((__m128i*)&temp[0], acc);
                retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
            }
        }
    }
    else if(sizeof(T) == 2) {
        alignas(16) int temp[4];
        for (i = 0; i < A._Row; ++i) {
            for (j = 0; j < C._Row; ++j) {
                acc = _mm_setzero_si128();
                for (k = 0; k + 8 <= A._Column; k += 8) {
                    X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
                    Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
                    acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
                }
                _mm_store_si128((__m128i*)&temp[0], acc);
                retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3];
                //retC(i, j) = inner_prod;
            }
        }
    }   
    return retSSE;
}


///compare the result of C and SSE
template <class T>
bool C_Compare_SSE(Matrix<T> &C_Matrix, Matrix<T> SSE_Matrix) {
    int i(C_Matrix._Row);
    while (--i >= 0) {
        if (memcmp(C_Matrix._Matrix[i], SSE_Matrix._Matrix[i], C_Matrix._Column * sizeof(T)) != 0) {
            return false;
        }
    }
    return true;
}

main.c

#include <stdio.h>
#include "Operate.h"
#include <cstdlib>
#include <time.h>

//Vec * Matrix
template <class T>
void Vec_Mat() {
    int N = 251;
    Matrix<T> A(10 * N, N*3, 2);
    double start, dt;
    A(3, 1) = 3;
    Vector<T> C(10 * N, 2);
    Vector<T> C1 = C._alignas(16);
    Matrix<T> C_Trans = A._alignas(16, 16); 

    cout << "\nThe time of C Vec * Matrix: " << endl;
    start = clock();
    int cir = 100;
    Vector<T> ret, retSSE;
    while (cir-- > 0) {
        Vec_Mul_Mat_C(C1, C_Trans);
    }
    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    ret = Vec_Mul_Mat_C(C1, C_Trans);
    //ret.print();


    cout << "\nThe time of SSE Vec * Matrix :" << endl;
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Vec_Mul_Mat_SSE(C1, C_Trans);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Vec_Mul_Mat_SSE(C1, C_Trans);
    //retSSE.print();

    if (memcmp(ret._Vector, retSSE._Vector, ret._Column * sizeof(char)) == 0) {
        cout << "C and SSE have the same result " << endl;
    }
}

template <class T>
void Mat_Mat() {
    int N = 254;
    Matrix<T> A(N, N, 2), B(N, N, 1);
    A(3, 1) = 3, B(1, 0) = 4; B(2, 0) = 3;
    Matrix<T> A_align = A._alignas(16, 16);
    Matrix<T> B_align = B._alignas(16, 16);
    double start, dt;

    cout << "\nThe time of C Matrix * Matrix: " << endl;
    start = clock();
    int cir = 100;
    Matrix<T> ret, retSSE;
    while (cir-- > 0) {
        Matrix_Mul_C(A_align, B_align);
    }
    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    ret = Matrix_Mul_C(A_align, B_align);
    //ret.print();
    

    /*
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_C_(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;

    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_CC_(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;

*/

    cout << "\nThe time of SSE Matrix * Matrix :" << endl;
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_SSE(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Matrix_Mul_SSE(A_align, B_align);
    retSSE.print();


    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_SSE_8(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Matrix_Mul_SSE_8(A_align, B_align);
    retSSE.print();

    /*
    cout << "\nThe time of SSE Vec * Matrix :" << endl;
    start = clock();
    cir = 10;
    while (cir-- > 0) {
        retSSE = Matrix_Mul_SSE_8(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 10 << "s" << endl;
    retSSE.print();
    */
    if (C_Compare_SSE(ret, retSSE)) {
        cout << "C and SSE have the same result " << endl;
    }
    
}


void main(short argc, char* argv[]) {

    //Vec_Mat<short>();

    Mat_Mat<short>();
    cout << "hello" << endl;
    return;
}

其中,Operator.h主要是进行矩阵运算,为了方便进行char和short类型的数据运算,使用了模板,初次使用,莫怪
本文思想[参考网址]。(https://blog.csdn.net/artorias123/article/details/86527456)

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 220,063评论 6 510
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 93,805评论 3 396
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 166,403评论 0 357
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 59,110评论 1 295
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 68,130评论 6 395
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,877评论 1 308
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,533评论 3 420
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 39,429评论 0 276
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,947评论 1 319
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 38,078评论 3 340
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 40,204评论 1 352
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,894评论 5 347
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 41,546评论 3 331
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 32,086评论 0 23
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 33,195评论 1 272
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 48,519评论 3 375
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 45,198评论 2 357