214 lines
5.5 KiB
C
214 lines
5.5 KiB
C
/*
|
|
* trans.c - Matrix transpose B = A^T
|
|
*
|
|
* Each transpose function must have a prototype of the form:
|
|
* void trans(int M, int N, int A[N][M], int B[M][N]);
|
|
*
|
|
* A transpose function is evaluated by counting the number of misses
|
|
* on a 1KB direct mapped cache with a block size of 32 bytes.
|
|
*/
|
|
#include <stdio.h>
|
|
#include "cachelab.h"
|
|
|
|
int is_transpose(int M, int N, int A[N][M], int B[M][N]);
|
|
|
|
/*
|
|
* transpose_submit - This is the solution transpose function that you
|
|
* will be graded on for Part B of the assignment. Do not change
|
|
* the description string "Transpose submission", as the driver
|
|
* searches for that string to identify the transpose function to
|
|
* be graded.
|
|
*/
|
|
char transpose_submit_desc[] = "Transpose submission";
|
|
void block_8x8_32(int t, int l, int A[32][32], int B[32][32]) {
|
|
for (int i = t; i < t + 8; i++) {
|
|
for (int j = l; j < l + 8; j++) {
|
|
B[j][i] = A[i][j];
|
|
}
|
|
}
|
|
}
|
|
void block_8x8_64(int t, int l, int A[64][64], int B[64][64]) {
|
|
int v0, v1, v2, v3;
|
|
for (int i = t; i < t + 4; i++) {
|
|
for (int j = l; j < l + 4; j++) {
|
|
B[j][i] = A[i][j];
|
|
}
|
|
for (int j = l + 4; j < l + 8; j++) {
|
|
B[j - 4][i + 4] = A[i][j];
|
|
}
|
|
}
|
|
|
|
for (int j = l; j < l + 4; j++) {
|
|
v0 = B[j][t + 4];
|
|
v1 = B[j][t + 5];
|
|
v2 = B[j][t + 6];
|
|
v3 = B[j][t + 7];
|
|
|
|
B[j][t + 4] = A[t + 4][j];
|
|
B[j][t + 5] = A[t + 5][j];
|
|
B[j][t + 6] = A[t + 6][j];
|
|
B[j][t + 7] = A[t + 7][j];
|
|
|
|
B[j + 4][t] = v0;
|
|
B[j + 4][t + 1] = v1;
|
|
B[j + 4][t + 2] = v2;
|
|
B[j + 4][t + 3] = v3;
|
|
}
|
|
|
|
for (int i = t + 4; i < t + 8; i++) {
|
|
for (int j = l + 4; j < l + 8; j++) {
|
|
B[j][i] = A[i][j];
|
|
}
|
|
}
|
|
}
|
|
void transpose_submit(int M, int N, int A[N][M], int B[M][N]) {
|
|
if (M == 32) {
|
|
for (int i = 0; i < N; i += 8) {
|
|
for (int ii = i; ii < i + 8; ii++) {
|
|
for (int jj = i; jj < i + 8; jj++) {
|
|
B[ii][(jj + 8) % N] = A[ii][jj];
|
|
}
|
|
}
|
|
for (int ii = i; ii < i + 8; ii++) {
|
|
for (int jj = i; jj < i + 8; jj++) {
|
|
B[jj][ii] = B[ii][(jj + 8) % N];
|
|
}
|
|
}
|
|
|
|
block_8x8_32((i + 8) % N, i, A, B);
|
|
}
|
|
|
|
for (int i = 0; i < N; i += 8) {
|
|
for (int j = 0; j < M; j += 8) {
|
|
if (i != j && (j + 8) % N != i) {
|
|
block_8x8_32(i, j, A, B);
|
|
}
|
|
}
|
|
}
|
|
} else if (M == 64) {
|
|
for (int i = 0; i < N; i += 8) {
|
|
int v0, v1, v2, v3;
|
|
for (int ii = i; ii < i + 4; ii++) {
|
|
for (int jj = i; jj < i + 8; jj++) {
|
|
B[ii][(jj + 8) % N] = A[ii][jj];
|
|
}
|
|
}
|
|
for (int ii = i + 4; ii < i + 8; ii++) {
|
|
for (int jj = i; jj < i + 8; jj++) {
|
|
B[ii - 4][(jj + 16) % N] = A[ii][jj];
|
|
}
|
|
}
|
|
|
|
for (int ii = i; ii < i + 4; ii++) {
|
|
for (int jj = i; jj < i + 4; jj++) {
|
|
B[jj][ii] = B[ii][(jj + 8) % N];
|
|
}
|
|
for (int jj = i + 4; jj < i + 8; jj++) {
|
|
B[jj - 4][ii + 4] = B[ii][(jj + 8) % N];
|
|
}
|
|
}
|
|
|
|
for (int jj = i; jj < i + 4; jj++) {
|
|
v0 = B[jj][i + 4];
|
|
v1 = B[jj][i + 5];
|
|
v2 = B[jj][i + 6];
|
|
v3 = B[jj][i + 7];
|
|
|
|
B[jj][i + 4] = B[i][(jj + 16) % N];
|
|
B[jj][i + 5] = B[i + 1][(jj + 16) % N];
|
|
B[jj][i + 6] = B[i + 2][(jj + 16) % N];
|
|
B[jj][i + 7] = B[i + 3][(jj + 16) % N];
|
|
|
|
B[jj + 4][i] = v0;
|
|
B[jj + 4][i + 1] = v1;
|
|
B[jj + 4][i + 2] = v2;
|
|
B[jj + 4][i + 3] = v3;
|
|
}
|
|
|
|
for (int ii = i + 4; ii < i + 8; ii++) {
|
|
for (int jj = i + 4; jj < i + 8; jj++) {
|
|
B[jj][ii] = B[ii - 4][(jj + 16) % N];
|
|
}
|
|
}
|
|
|
|
block_8x8_64((i + 8) % N, i, A, B);
|
|
block_8x8_64((i + 16) % N, i, A, B);
|
|
}
|
|
|
|
for (int i = 0; i < N; i += 8) {
|
|
for (int j = 0; j < M; j += 8) {
|
|
if (i != j && (j + 8) % N != i && (j + 16) % N != i) {
|
|
block_8x8_64(i, j, A, B);
|
|
}
|
|
}
|
|
}
|
|
} else if (M == 61) {
|
|
for (int j = 0; j < M; j += 17) {
|
|
for (int i = 0; i < N; i++) {
|
|
for (int jj = j; jj < j + 17 && jj < M; jj++) {
|
|
B[jj][i] = A[i][jj];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* You can define additional transpose functions below. We've defined
|
|
* a simple one below to help you get started.
|
|
*/
|
|
|
|
/*
|
|
* trans - A simple baseline transpose function, not optimized for the cache.
|
|
*/
|
|
char trans_desc[] = "Simple row-wise scan transpose";
|
|
void trans(int M, int N, int A[N][M], int B[M][N])
|
|
{
|
|
int i, j, tmp;
|
|
|
|
for (i = 0; i < N; i++) {
|
|
for (j = 0; j < M; j++) {
|
|
tmp = A[i][j];
|
|
B[j][i] = tmp;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
* registerFunctions - This function registers your transpose
|
|
* functions with the driver. At runtime, the driver will
|
|
* evaluate each of the registered functions and summarize their
|
|
* performance. This is a handy way to experiment with different
|
|
* transpose strategies.
|
|
*/
|
|
void registerFunctions()
|
|
{
|
|
/* Register your solution function */
|
|
registerTransFunction(transpose_submit, transpose_submit_desc);
|
|
|
|
/* Register any additional transpose functions */
|
|
registerTransFunction(trans, trans_desc);
|
|
|
|
}
|
|
|
|
/*
|
|
* is_transpose - This helper function checks if B is the transpose of
|
|
* A. You can check the correctness of your transpose by calling
|
|
* it before returning from the transpose function.
|
|
*/
|
|
int is_transpose(int M, int N, int A[N][M], int B[M][N])
|
|
{
|
|
int i, j;
|
|
|
|
for (i = 0; i < N; i++) {
|
|
for (j = 0; j < M; ++j) {
|
|
if (A[i][j] != B[j][i]) {
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|