coding: Медленное умножение матриц 1024х1024

суббота, 14 декабря 2019 г.

Медленное умножение матриц 1024х1024

#кэширование #матрицы #любой_язык

Почему матрицы размеров 1025x1025, 1023x1023 перемножаются быстрее матриц 1024x1024
стандартным алгоритмом?

Ответы

Ответ 1


В комментарий не влезет, но это не ответ. Это именно комментарий.

Написал для проверки (см. ниже). Обалдел, ибо у меня, скомпилированное VC++ 2017,
таки дало:

1023: 2106433
1024: 7664347
1025: 2106884


При отключенной оптимизации эффект выражен меньше:

1023: 9592402
1024: 11608342
1025: 9480406


Это для 64-разрядного приложения. 32-разрядное, впрочем, почти такое же.

1023: 2910957
1024: 7729545
1025: 2192236


"А я что? сама в шоке!" (с) Анекдот про пограничную собаку...

Вот код.

#include 
#include 
#include 
#include 
#include 

using namespace std;

class muTimer
{
    using Clock = std::chrono::high_resolution_clock;
    bool active = false;
    Clock::duration   duration_;
    Clock::time_point start_ = Clock::now(), stop_ = Clock::now();

    muTimer(const muTimer&)             = delete;
    muTimer& operator=(const muTimer&)  = delete;
public:
    using ns       = std::chrono::nanoseconds;
    using mks      = std::chrono::microseconds;
    using ms       = std::chrono::milliseconds;
    muTimer() { reset(); start(); }
    ~muTimer() = default;
    muTimer& reset()
    {
        duration_ = std::chrono::nanoseconds(0);
        active    = false;
        return *this;
    }
    muTimer& start()
    {
        if (!active)
        {
            start_ = Clock::now();
            active = true;
        }
        return *this;
    }
    muTimer& stop()
    {
        if (active)
        {
            stop_      = Clock::now();
            duration_ += stop_ - start_;
            active     = false;
        }
        return *this;
    }
    template
        unsigned long long duration()
    {
        return static_cast
            (std::chrono::duration_cast(stop_-start_).count());
    }
};


double m1024_1[1024][1024], m1024_2[1024][1024], m1024_3[1024][1024];
double m1025_1[1025][1025], m1025_2[1025][1025], m1025_3[1025][1025];
double m1023_1[1023][1023], m1023_2[1023][1023], m1023_3[1023][1023];

int main(int argc, const char * argv[])
{
    for(int i = 0; i < 1024; ++i)
        for(int j = 0; j < 1024; ++j)
        {
            m1024_1[i][j] = rand();
            m1024_2[i][j] = rand();
        }
    for(int i = 0; i < 1025; ++i)
        for(int j = 0; j < 1025; ++j)
        {
            m1025_1[i][j] = rand();
            m1025_2[i][j] = rand();
        }
    for(int i = 0; i < 1023; ++i)
        for(int j = 0; j < 1023; ++j)
        {
            m1023_1[i][j] = rand();
            m1023_2[i][j] = rand();
        }

    {
        muTimer mt;
        for(int i = 0; i < 1023; ++i)
            for(int j = 0; j < 1023; ++j)
            {
                double s = 0.0;
                for(int k = 0; k < 1023; ++k)
                {
                    s += m1023_1[i][k]*m1023_2[k][j];
                }
                m1023_3[i][j] = s;
            }
        mt.stop();
        cout << "1023: " << mt.duration() << endl;
    }
    {
        muTimer mt;
        for(int i = 0; i < 1024; ++i)
            for(int j = 0; j < 1024; ++j)
            {
                double s = 0.0;
                for(int k = 0; k < 1024; ++k)
                {
                    s += m1024_1[i][k]*m1024_2[k][j];
                }
                m1024_3[i][j] = s;
            }
        mt.stop();
        cout << "1024: " << mt.duration() << endl;
    }
    {
        muTimer mt;
        for(int i = 0; i < 1025; ++i)
            for(int j = 0; j < 1025; ++j)
            {
                double s = 0.0;
                for(int k = 0; k < 1025; ++k)
                {
                    s += m1025_1[i][k]*m1025_2[k][j];
                }
                m1025_3[i][j] = s;
            }
        mt.stop();
        cout << "1025: " << mt.duration() << endl;
    }

}


При 1024 ассемблерный код отличается:

Вот основной код для 1024:

; 104  :                 for(int k = 0; k < 1024; ++k)
; 105  :                 {
; 106  :                     s += m1024_1[i][k]*m1024_2[k][j];

    movsd   xmm1, QWORD PTR [rcx-8192]
    mulsd   xmm1, QWORD PTR [rax-8]
    movsd   xmm0, QWORD PTR [rax]
    mulsd   xmm0, QWORD PTR [rcx]
    addsd   xmm1, xmm2
    movaps  xmm2, xmm1
    movsd   xmm1, QWORD PTR [rcx+8192]
    mulsd   xmm1, QWORD PTR [rax+8]
    addsd   xmm2, xmm0
    movsd   xmm0, QWORD PTR [rcx+16384]
    mulsd   xmm0, QWORD PTR [rax+16]
    addsd   xmm2, xmm1
    movsd   xmm1, QWORD PTR [rcx+24576]
    mulsd   xmm1, QWORD PTR [rax+24]
    addsd   xmm2, xmm0
    movsd   xmm0, QWORD PTR [rcx+32768]
    mulsd   xmm0, QWORD PTR [rax+32]
    addsd   xmm2, xmm1
    movsd   xmm1, QWORD PTR [rcx+40960]
    mulsd   xmm1, QWORD PTR [rax+40]
    addsd   xmm2, xmm0
    movsd   xmm0, QWORD PTR [rcx+49152]
    mulsd   xmm0, QWORD PTR [rax+48]
    add rcx, 65536              ; 00010000H
    add rax, 64                 ; 00000040H
    addsd   xmm2, xmm1
    addsd   xmm2, xmm0
    sub r8, 1
    jne $LL37@main


А вот - для 1023:

; 89   :                 for(int k = 0; k < 1023; ++k)
; 90   :                 {
; 91   :                     s += m1023_1[i][k]*m1023_2[k][j];

    movsd   xmm1, QWORD PTR [rcx-8184]
    mulsd   xmm1, QWORD PTR [rax-8]
    movsd   xmm0, QWORD PTR [rcx]
    mulsd   xmm0, QWORD PTR [rax]
    addsd   xmm1, xmm2
    movaps  xmm2, xmm1
    movsd   xmm1, QWORD PTR [rcx+8184]
    mulsd   xmm1, QWORD PTR [rax+8]
    addsd   xmm2, xmm0
    add rax, 24
    add rcx, 24552              ; 00005fe8H
    addsd   xmm2, xmm1
    sub rdx, 1
    jne SHORT $LL28@main


Объяснений не даю, сам хочу услышать :)

P.S. Вносим единственное изменение:

double m1024_1[1025][1025], m1024_2[1025][1025], m1024_3[1025][1025];
double m1025_1[1025][1025], m1025_2[1025][1025], m1025_3[1025][1025];
double m1023_1[1025][1025], m1023_2[1025][1025], m1023_3[1025][1025];


Имеем:

1023: 1973593
1024: 1983533
1025: 1981114


Ответ 2


Решил и свою лепту внести на java.


  3615298956
   4363341525
   4608966672


Новые ответы считает
Считало где-то по 10 минут

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package time;

/**
 *
 * @author milan
 */
public class Time {

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        System.out.println(i1023());
        System.out.println(i1024());
        System.out.println(i1025());



    }

    public static long i1023() {
        int mas1023_1[][] = new int[1023][1023];
        int mas1023_2[][] = new int[1023][1023];

        long ans[] = new long[10];
        long time1023 = 0;
        for (int k = 0; k < 10; k++) {
            long start;
            long finish;
            int[][] res = new int[1023][1023];

            start = System.nanoTime();
            for (int i = 0; i < 1023; i++) {
                for (int j = 0; j < 1023; j++) {
                    for (int l = 0; l < 1023; l++) {
                        res[i][j] += mas1023_1[i][l] * mas1023_2[l][j];
                    }
                }
            }
            finish = System.nanoTime();
            time1023 = finish - start;
            //System.out.println(time1023);
            ans[k] = time1023;
        }
        long sum = 0;
        for (int i = 0; i < 10; i++) {
            sum += ans[i];
        }
        sum = sum / 10;
        return sum;
    }

    public static long i1024() {
        int mas1024_1[][] = new int[1024][1024];
        int mas1024_2[][] = new int[1024][1024];
        long ans[] = new long[10];
        long time1024 = 0;
        for (int k = 0; k < 10; k++) {
            long start;
            long finish;
            int[][] res = new int[1024][1024];

            start = System.nanoTime();
            for (int i = 0; i < 1024; i++) {
                for (int j = 0; j < 1024; j++) {
                    for (int l = 0; l < 1024; l++) {
                        res[i][j] += mas1024_1[i][l] * mas1024_2[l][j];
                    }
                }
            }
            finish = System.nanoTime();
            time1024 = finish - start;
            //System.out.println(time1024);
            ans[k] = time1024;
        }
        long sum = 0;
        for (int i = 0; i < 10; i++) {
            sum += ans[i];
        }
        sum = sum / 10;
        return sum;
    }

    public static long i1025() {
        int mas1025_1[][] = new int[1025][1025];
        int mas1025_2[][] = new int[1025][1025];
        long ans[] = new long[10];
        long time1025 = 0;
        for (int k = 0; k < 10; k++) {
            long start;
            long finish;
            int[][] res = new int[1025][1025];
            start = System.nanoTime();
            for (int i = 0; i < 1025; i++) {
                for (int j = 0; j < 1025; j++) {
                    for (int l = 0; l < 1025; l++) {
                        res[i][j] += mas1025_1[i][l] * mas1025_2[l][j];
                    }
                }
            }
            finish = System.nanoTime();
            time1025 = finish - start;
            //System.out.println(time1025);
            ans[k] = time1025;
        }
        long sum = 0;
        for (int i = 0; i < 10; i++) {
            sum += ans[i];
        }
        sum = sum / 10;
        return sum;
    }
}

coding

Страницы

Поиск по вопросам

суббота, 14 декабря 2019 г.

Медленное умножение матриц 1024х1024

Ответы

Ответ 1

Ответ 2

Комментариев нет:

Отправить комментарий

Страницы

Поиск по вопросам

суббота, 14 декабря 2019 г.

Медленное умножение матриц 1024х1024

Ответы

Ответ 1

Ответ 2

Комментариев нет:

Отправить комментарий

суббота, 14 декабря 2019 г.