optimize cachelab

This commit is contained in:
18218461270@163.com 2025-08-30 23:39:10 +08:00
parent 738bb9531f
commit f17d89f0d3

231
cache/trans.c vendored
View File

@ -20,140 +20,125 @@ int is_transpose(int M, int N, int A[N][M], int B[M][N]);
* be graded. * be graded.
*/ */
char transpose_submit_desc[] = "Transpose submission"; char transpose_submit_desc[] = "Transpose submission";
void transpose_sub(int N, int t, int l, int size, int B[][N]) { void block_8x8_32(int t, int l, int A[32][32], int B[32][32]) {
for (int i = 0; i < size; i++) { for (int i = t; i < t + 8; i++) {
for (int j = i + 1; j < size; j++) { for (int j = l; j < l + 8; j++) {
B[t + i][l + j] ^= B[t + j][l + i]; B[j][i] = A[i][j];
B[t + j][l + i] ^= B[t + i][l + j]; }
B[t + i][l + j] ^= B[t + j][l + i]; }
}
void block_8x8_64(int t, int l, int A[64][64], int B[64][64]) {
int v0, v1, v2, v3;
for (int i = t; i < t + 4; i++) {
for (int j = l; j < l + 4; j++) {
B[j][i] = A[i][j];
}
for (int j = l + 4; j < l + 8; j++) {
B[j - 4][i + 4] = A[i][j];
}
}
for (int j = l; j < l + 4; j++) {
v0 = B[j][t + 4];
v1 = B[j][t + 5];
v2 = B[j][t + 6];
v3 = B[j][t + 7];
B[j][t + 4] = A[t + 4][j];
B[j][t + 5] = A[t + 5][j];
B[j][t + 6] = A[t + 6][j];
B[j][t + 7] = A[t + 7][j];
B[j + 4][t] = v0;
B[j + 4][t + 1] = v1;
B[j + 4][t + 2] = v2;
B[j + 4][t + 3] = v3;
}
for (int i = t + 4; i < t + 8; i++) {
for (int j = l + 4; j < l + 8; j++) {
B[j][i] = A[i][j];
} }
} }
} }
void transpose_submit(int M, int N, int A[N][M], int B[M][N]) { void transpose_submit(int M, int N, int A[N][M], int B[M][N]) {
int v0, v1, v2, v3, v4, v5, v6, v7;
if (M == 32) { if (M == 32) {
for (int j = 0; j < M; j += 8) { for (int i = 0; i < N; i += 8) {
for (int i = 0; i < N; i++) { for (int ii = i; ii < i + 8; ii++) {
v0 = A[i][j]; for (int jj = i; jj < i + 8; jj++) {
v1 = A[i][j + 1]; B[ii][(jj + 8) % N] = A[ii][jj];
v2 = A[i][j + 2]; }
v3 = A[i][j + 3]; }
v4 = A[i][j + 4]; for (int ii = i; ii < i + 8; ii++) {
v5 = A[i][j + 5]; for (int jj = i; jj < i + 8; jj++) {
v6 = A[i][j + 6]; B[jj][ii] = B[ii][(jj + 8) % N];
v7 = A[i][j + 7]; }
}
B[j][i] = v0; block_8x8_32((i + 8) % N, i, A, B);
B[j + 1][i] = v1; }
B[j + 2][i] = v2;
B[j + 3][i] = v3; for (int i = 0; i < N; i += 8) {
B[j + 4][i] = v4; for (int j = 0; j < M; j += 8) {
B[j + 5][i] = v5; if (i != j && (j + 8) % N != i) {
B[j + 6][i] = v6; block_8x8_32(i, j, A, B);
B[j + 7][i] = v7; }
} }
} }
} else if (M == 64) { } else if (M == 64) {
for (int i = 0; i < N; i += 8) {
int v0, v1, v2, v3;
for (int ii = i; ii < i + 4; ii++) {
for (int jj = i; jj < i + 8; jj++) {
B[ii][(jj + 8) % N] = A[ii][jj];
}
}
for (int ii = i + 4; ii < i + 8; ii++) {
for (int jj = i; jj < i + 8; jj++) {
B[ii - 4][(jj + 16) % N] = A[ii][jj];
}
}
for (int ii = i; ii < i + 4; ii++) {
for (int jj = i; jj < i + 4; jj++) {
B[jj][ii] = B[ii][(jj + 8) % N];
}
for (int jj = i + 4; jj < i + 8; jj++) {
B[jj - 4][ii + 4] = B[ii][(jj + 8) % N];
}
}
for (int jj = i; jj < i + 4; jj++) {
v0 = B[jj][i + 4];
v1 = B[jj][i + 5];
v2 = B[jj][i + 6];
v3 = B[jj][i + 7];
B[jj][i + 4] = B[i][(jj + 16) % N];
B[jj][i + 5] = B[i + 1][(jj + 16) % N];
B[jj][i + 6] = B[i + 2][(jj + 16) % N];
B[jj][i + 7] = B[i + 3][(jj + 16) % N];
B[jj + 4][i] = v0;
B[jj + 4][i + 1] = v1;
B[jj + 4][i + 2] = v2;
B[jj + 4][i + 3] = v3;
}
for (int ii = i + 4; ii < i + 8; ii++) {
for (int jj = i + 4; jj < i + 8; jj++) {
B[jj][ii] = B[ii - 4][(jj + 16) % N];
}
}
block_8x8_64((i + 8) % N, i, A, B);
block_8x8_64((i + 16) % N, i, A, B);
}
for (int i = 0; i < N; i += 8) { for (int i = 0; i < N; i += 8) {
for (int j = 0; j < M; j += 8) { for (int j = 0; j < M; j += 8) {
if (i == j) { if (i != j && (j + 8) % N != i && (j + 16) % N != i) {
for (int ii = i; ii < i + 4; ii++) { block_8x8_64(i, j, A, B);
v0 = A[ii][j];
v1 = A[ii][j + 1];
v2 = A[ii][j + 2];
v3 = A[ii][j + 3];
v4 = A[ii][j + 4];
v5 = A[ii][j + 5];
v6 = A[ii][j + 6];
v7 = A[ii][j + 7];
B[ii][j] = v0;
B[ii][j + 1] = v1;
B[ii][j + 2] = v2;
B[ii][j + 3] = v3;
B[ii][j + 4] = v4;
B[ii][j + 5] = v5;
B[ii][j + 6] = v6;
B[ii][j + 7] = v7;
}
transpose_sub(N, i, j, 4, B);
transpose_sub(N, i, j + 4, 4, B);
for (int ii = i + 4; ii < i + 8; ii++) {
v0 = A[ii][j];
v1 = A[ii][j + 1];
v2 = A[ii][j + 2];
v3 = A[ii][j + 3];
v4 = A[ii][j + 4];
v5 = A[ii][j + 5];
v6 = A[ii][j + 6];
v7 = A[ii][j + 7];
B[ii][j] = v0;
B[ii][j + 1] = v1;
B[ii][j + 2] = v2;
B[ii][j + 3] = v3;
B[ii][j + 4] = v4;
B[ii][j + 5] = v5;
B[ii][j + 6] = v6;
B[ii][j + 7] = v7;
}
transpose_sub(N, i + 4, j, 4, B);
transpose_sub(N, i + 4, j + 4, 4, B);
for (int ii = i; ii < i + 4; ii++) {
v0 = B[ii + 4][j];
v1 = B[ii + 4][j + 1];
v2 = B[ii + 4][j + 2];
v3 = B[ii + 4][j + 3];
v4 = B[ii][j + 4];
v5 = B[ii][j + 5];
v6 = B[ii][j + 6];
v7 = B[ii][j + 7];
B[ii][j + 4] = v0;
B[ii][j + 5] = v1;
B[ii][j + 6] = v2;
B[ii][j + 7] = v3;
B[ii + 4][j] = v4;
B[ii + 4][j + 1] = v5;
B[ii + 4][j + 2] = v6;
B[ii + 4][j + 3] = v7;
}
} else {
for (int ii = i; ii < i + 4; ii++) {
for (int jj = j; jj < j + 4; jj++) {
B[jj][ii] = A[ii][jj];
}
for (int jj = j + 4; jj < j + 8; jj++) {
B[jj - 4][ii + 4] = A[ii][jj];
}
}
for (int jj = j; jj < j + 4; jj++) {
v0 = B[jj][i + 4];
v1 = B[jj][i + 5];
v2 = B[jj][i + 6];
v3 = B[jj][i + 7];
B[jj][i + 4] = A[i + 4][jj];
B[jj][i + 5] = A[i + 5][jj];
B[jj][i + 6] = A[i + 6][jj];
B[jj][i + 7] = A[i + 7][jj];
B[jj + 4][i] = v0;
B[jj + 4][i + 1] = v1;
B[jj + 4][i + 2] = v2;
B[jj + 4][i + 3] = v3;
}
for (int ii = i + 4; ii < i + 8; ii++) {
for (int jj = j + 4; jj < j + 8; jj++) {
B[jj][ii] = A[ii][jj];
}
}
} }
} }
} }