在机器学习中,MNIST数据集是一个使用非常广泛的数据集。一般教程都是用Python解析MNIST数据集,本文将采用C++语言来读取MNIST手写数字数据集中的图像和标签数据。
环境准备:
vs2015
OpenCV4.5.0
首先在mnist官网上下载mnist数据集的压缩包。

下载完成后将压缩包解压,得到idx3-ubyte(图像)和idx1-ubyte(标签)格式的文件,如下图所示:

下面代码为C++版解析代码:
#include<iostream>
#include<opencv.hpp>
#include <string>
#include <fstream>
using namespace std;
using namespace cv;
// Byte-order conversion: swaps the byte order of an integer read from the file
int reverseInt(int i);
// Reads the image data set from the file at fileName
Mat read_mnist_image(const string fileName);
// Reads the label data set from the file at fileName
Mat read_mnist_label(const string fileName);
// Hard-coded locations of the training image and label files used by main()
string train_images_path = "G:/vs2015_opencv_ml/mnist/train-images.idx3-ubyte";
string train_labels_path = "G:/vs2015_opencv_ml/mnist/train-labels.idx1-ubyte";
int main()
{
//读取标签数据 (60000,1) 类型为int32
Mat train_labels = read_mnist_label(train_labels_path);
//打印第一个标签
Mat label0 = train_labels.rowRange(0, 1);
cout << "第一个标签:" << label0 << endl;
//读取图像数据 (60000,784) 类型为float32 数据未归一化
Mat train_images = read_mnist_image(train_images_path);
//取出第0张图像
Mat img0 = train_images.rowRange(0, 1);//(1,784)
//改变形状为(28,28)
img0 = img0.reshape(1, 28);
//显示图像
cv::imshow("test", img0);
cv::waitKey(0);
getchar();
return 0;
}
;
// Returns i with the order of its four low bytes reversed, i.e. the value
// obtained when the same bytes are interpreted in the opposite byte order.
int reverseInt(int i) {
    int swapped = 0;
    // The byte found at bit offset `shift` moves to bit offset `24 - shift`;
    // the least significant byte is placed first (it ends up on top).
    for (int shift = 0; shift < 32; shift += 8) {
        const unsigned char byte = (i >> shift) & 255;
        swapped += static_cast<int>(byte) << (24 - shift);
    }
    return swapped;
}
// Loads an MNIST idx3-ubyte image file.
//
// fileName: path of the image file.
// Returns a CV_32FC1 matrix with one image per row (rows * cols columns) whose
// values are the raw byte values of the pixels (not normalised). Returns an
// empty Mat when the file cannot be opened, the header is invalid, or the
// pixel data is shorter than the header announces.
//
// The header integers are always byte-swapped with reverseInt, so this
// assumes the host byte order is the opposite of the file's.
Mat read_mnist_image(const string fileName) {
    // Magic number of an idx3-ubyte image file (0x00000803).
    const int kImageMagic = 2051;

    ifstream file(fileName, ios::binary);
    if (!file.is_open())
    {
        cerr << "无法打开图像集文件:" << fileName << endl;
        return Mat();
    }
    cout << "成功打开图像集 ..." << endl;

    // Header: magic number, image count, rows per image, columns per image.
    int magic_number = 0;
    int number_of_images = 0;
    int n_rows = 0;
    int n_cols = 0;
    file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
    file.read(reinterpret_cast<char*>(&number_of_images), sizeof(number_of_images));
    file.read(reinterpret_cast<char*>(&n_rows), sizeof(n_rows));
    file.read(reinterpret_cast<char*>(&n_cols), sizeof(n_cols));
    if (!file)
    {
        cerr << "图像集文件头读取失败:" << fileName << endl;
        return Mat();
    }
    magic_number = reverseInt(magic_number);
    number_of_images = reverseInt(number_of_images);
    n_rows = reverseInt(n_rows);
    n_cols = reverseInt(n_cols);
    cout << "幻数(文件格式):" << magic_number
         << " 图像总数:" << number_of_images
         << " 每个图像的行数:" << n_rows
         << " 每个图像的列数:" << n_cols << endl;

    // Reject anything that is not an image file before sizing a matrix from
    // values that would otherwise be garbage.
    if (magic_number != kImageMagic || number_of_images < 0 || n_rows <= 0 || n_cols <= 0)
    {
        cerr << "图像集文件头无效:" << fileName << endl;
        return Mat();
    }

    cout << "开始读取Image数据......" << endl;
    const int pixels_per_image = n_rows * n_cols;
    Mat DataMat;
    if (number_of_images == 0)
    {
        DataMat = Mat::zeros(0, pixels_per_image, CV_32FC1);
    }
    else
    {
        // A freshly allocated Mat is continuous, so the whole pixel payload
        // can be read with one call instead of one call per byte.
        Mat raw(number_of_images, pixels_per_image, CV_8UC1);
        const streamsize byte_count = static_cast<streamsize>(raw.total());
        file.read(reinterpret_cast<char*>(raw.data), byte_count);
        if (file.gcount() != byte_count)
        {
            cerr << "图像数据不完整:" << fileName << endl;
            return Mat();
        }
        // Normalisation, if wanted, can be applied here through the scale
        // argument of convertTo.
        raw.convertTo(DataMat, CV_32FC1);
    }
    cout << "读取Image数据完毕......" << endl;
    return DataMat;
}
// Loads an MNIST idx1-ubyte label file.
//
// fileName: path of the label file.
// Returns a CV_32SC1 matrix with one row per label and a single column.
// Returns an empty Mat when the file cannot be opened, the header is invalid,
// or the label data is shorter than the header announces.
//
// The header integers are always byte-swapped with reverseInt, so this
// assumes the host byte order is the opposite of the file's.
Mat read_mnist_label(const string fileName) {
    // Magic number of an idx1-ubyte label file (0x00000801).
    const int kLabelMagic = 2049;

    ifstream file(fileName, ios::binary);
    if (!file.is_open())
    {
        cerr << "无法打开标签集文件:" << fileName << endl;
        return Mat();
    }
    cout << "成功打开标签集 ... " << endl;

    // Header: magic number followed by the number of labels.
    int magic_number = 0;
    int number_of_items = 0;
    file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
    file.read(reinterpret_cast<char*>(&number_of_items), sizeof(number_of_items));
    if (!file)
    {
        cerr << "标签集文件头读取失败:" << fileName << endl;
        return Mat();
    }
    magic_number = reverseInt(magic_number);
    number_of_items = reverseInt(number_of_items);
    cout << "幻数(文件格式):" << magic_number << " ;标签总数:" << number_of_items << endl;

    if (magic_number != kLabelMagic || number_of_items < 0)
    {
        cerr << "标签集文件头无效:" << fileName << endl;
        return Mat();
    }

    cout << "开始读取Label数据......" << endl;
    // CV_32SC1: 32-bit signed integer, one channel.
    Mat LabelMat;
    if (number_of_items == 0)
    {
        LabelMat = Mat::zeros(0, 1, CV_32SC1);
    }
    else
    {
        // Read every label byte in one call, then widen to 32-bit signed
        // integers. This also avoids accessing CV_32S elements through a
        // mismatched unsigned type.
        Mat raw(number_of_items, 1, CV_8UC1);
        const streamsize byte_count = static_cast<streamsize>(raw.total());
        file.read(reinterpret_cast<char*>(raw.data), byte_count);
        if (file.gcount() != byte_count)
        {
            cerr << "标签数据不完整:" << fileName << endl;
            return Mat();
        }
        raw.convertTo(LabelMat, CV_32SC1);
    }
    cout << "读取Label数据完毕......" << endl;
    return LabelMat;
}
运行程序,执行结果如下:

1万+

被折叠的 条评论
为什么被折叠?



