当我们在 Windows 系统下使用 MinGW-w64 作为编译器进行 C 语言编程时,如果源代码文件(.c)以 ANSI 格式(在简体中文 Windows 上即 GBK 编码)保存,则打印汉字时不会出现乱码;反之,如果以 UTF-8 格式保存,则会出现乱码,需要在编译时加上“-fexec-charset=gbk”选项来解决乱码问题。
#include <stdio.h>

/*
 * Minimal demo: print a Chinese string literal to the console.
 * The bytes written are those of the execution character set chosen at
 * compile time (e.g. via -fexec-charset=gbk when the source is UTF-8).
 */
int main(void)
{
printf("%s\n", "你好,世界!");
return 0;
}
![](https://img2018.cnblogs.com/common/1600095/202002/1600095-20200221125242062-1643349898.png)
![](https://img2018.cnblogs.com/common/1600095/202002/1600095-20200221125250066-1401298360.png)
![](https://img2018.cnblogs.com/common/1600095/202002/1600095-20200221125257396-1200465681.png)
![](https://img2018.cnblogs.com/common/1600095/202002/1600095-20200221125306520-498009573.png)
那么,当我们使用 C 语言处理文件时,如果文件格式是 UTF-8,如何转换成 ANSI 格式呢?注意:在 Windows 下,txt 文件的保存格式中有“UTF-8”和“带有 BOM 的 UTF-8”两个选项。如果选择带有 BOM 的 UTF-8,则读取文件时会首先读到三个标志该文件采用 UTF-8 编码的字节,分别为 EF BB BF;如果选择 UTF-8,则不会有这三个字节,一开始读到的就是文件本身的内容。详情请看:https://blog.csdn.net/wang544831183/article/details/82349668
![](https://img2018.cnblogs.com/common/1600095/202002/1600095-20200221125358094-1156243525.png)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1 // char.h
2 // 头文件
3 // 定义结构体File,存储文件信息
4 // 定义结构体初始化函数以及其他函数
5
6 #ifndef _H_CHAR_
7 #define _H_CHAR_
8
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
13
/*
 * Byte-order mark that prefixes a "UTF-8 with BOM" file.
 * Declared static const because this definition lives in a header: every
 * translation unit that includes it gets its own read-only copy instead of
 * a duplicate external definition.
 */
static const unsigned char BOM[] = {0xEF, 0xBB, 0xBF};
15
/*
 * File: descriptive record for a file the program wants to inspect.
 * The string members are plain pointers; the struct does not own copies.
 */
struct file {
    char *filename; /* path handed to fopen() */
    char *filemode; /* fopen() mode string, e.g. "r" */
    char *encoding; /* encoding label: "Ansi" or "UTF-8" */
    bool readable;  /* true when filemode permits reading */
};

typedef struct file File;
23
24 // 函数定义
25 File *initializeFile(char *, char *); // 初始化结构体
26 bool isUTF8(char *); // 判断文件是否为UTF-8编码格式
27 bool isReadable(char *); // 判断filemode是否支持可读选项
28 bool charInString(char, char *); // 判断字符是否在字符串中
29 void fprint(File *); // 打印结构体内容
30 void fprintInfo(File *); // 打印结构体信息
31 void deconstructFile(File *); // 析构化结构体
32 void fprintAnsi(File *); // 以Ansi格式打印
33 // 转换函数定义
34 void Convert(const char* strIn, char* strOut, int sourceCodepage, int targetCodepage);
35 void UTF82Ansi(PCHAR Src, PCHAR Dst);
36 void Ansi2UTF8(PCHAR Src, PCHAR Dst);
37
38 // 函数实现
39 File *initializeFile(char *filename, char *filemode)
40 {
41 File *file = (File *)malloc(sizeof(File));
42 file->filename = filename;
43 file->filemode = filemode;
44 file->encoding = "Ansi";
45 file->readable = false;
46
47 if (isUTF8(filename))
48 {
49 file->encoding = "UTF-8";
50 }
51
52 if (isReadable(filemode))
53 {
54 file->readable = true;
55 }
56
57 return file;
58 }
59
60 bool isUTF8(char *filename)
61 {
62 FILE *fp = fopen(filename, "r");
63
64 if (fp == NULL)
65 {
66 printf("Unable to open file %s\n", filename);
67 return false;
68 }
69
70 unsigned char *buffer = (unsigned char *)malloc(sizeof(unsigned char) * 3);
71 fread(buffer, 3, 3, fp);
72 fclose(fp);
73
74 for (int i = 0; i < 3; i++)
75 {
76 if (buffer[i] != BOM[i])
77 {
78 return false;
79 }
80 }
81
82 return true;
83 }
84
/*
 * Report whether an fopen() mode string allows reading, i.e. whether it
 * contains the character 'r' or the character '+'.
 */
bool isReadable(char *filemode)
{
    for (const char *p = filemode; *p != '\0'; ++p)
    {
        if (*p == 'r' || *p == '+')
        {
            return true;
        }
    }
    return false;
}
93
/*
 * Report whether `ch` occurs in the NUL-terminated string `str`.
 * The terminator itself is never matched, so searching for '\0' yields false.
 */
bool charInString(char ch, char *str)
{
    for (const char *cursor = str; *cursor != '\0'; ++cursor)
    {
        if (*cursor == ch)
        {
            return true;
        }
    }
    return false;
}
106
107 void fprintInfo(File *file)
108 {
109 printf("filename: %s\nfilemode: %s\nencoding: %s\nreadble: %d\n", file->filename, file->filemode, file->encoding, file->readable);
110 }
111
112 void fprint(File *file)
113 {
114 // 只有filemode中存在'r'或者'+'字符时,才可以调用fprint函数
115 if (!file->readable)
116 {
117 return;
118 }
119
120 FILE *fp = fopen(file->filename, file->filemode);
121 char chr;
122
123 if (fp == NULL)
124 {
125 printf("Failed to open file %s\n", file->filename);
126 return;
127 }
128
129 while ((chr = fgetc(fp)) != EOF)
130 {
131 printf("%c", chr);
132 }
133
134 puts("");
135 fseek(fp, 0, SEEK_SET); // 重置到开头
136
137 while ((chr = fgetc(fp)) != EOF)
138 {
139 printf("%x ", (unsigned char)chr);
140 }
141 printf("\n\n");
142
143 fclose(fp);
144 }
145
146 void deconstructFile(File *file)
147 {
148 free(file);
149 file = NULL;
150 }
151
152 void fprintAnsi(File *file)
153 {
154 if (!file->readable)
155 {
156 return;
157 }
158 if (file->encoding == "Ansi")
159 {
160 fprint(file);
161 }
162 else
163 {
164 FILE *fp = fopen(file->filename, file->filemode);
165
166 int i = 0;
167 char ch;
168 char utf8[MAX_PATH] = {0};
169 char ansi[MAX_PATH] = {0};
170
171 // 跳过头三个BOM字符
172 fseek(fp, 3, SEEK_SET);
173 while ((ch = fgetc(fp)) != EOF)
174 {
175 utf8[i++] = ch;
176 }
177 utf8[i] = '\0';
178 UTF82Ansi(utf8, ansi);
179 printf("%s\n", ansi);
180
181 fclose(fp);
182 fp = NULL;
183 }
184 }
185
186 // 编码格式转换函数
187 void Convert(const char* strIn, char* strOut, int sourceCodepage, int targetCodepage)
188 {
189 int len = lstrlenA(strIn);
190 int unicodeLen = MultiByteToWideChar(sourceCodepage, 0, strIn, -1, NULL, 0);
191 wchar_t pUnicode[1024] = {0};
192 MultiByteToWideChar(sourceCodepage, 0, strIn, - 1, (LPWSTR)pUnicode, unicodeLen);
193
194 BYTE pTargetData[2048] = {0};
195 int targetLen = WideCharToMultiByte(targetCodepage, 0, (LPWSTR)pUnicode, -1, (char*)pTargetData,0, NULL, NULL);
196 WideCharToMultiByte(targetCodepage, 0, (LPWSTR)pUnicode, -1,(char*)pTargetData, targetLen, NULL, NULL);
197 lstrcpyA(strOut,(char*)pTargetData);
198 }
199 void UTF82Ansi(PCHAR Src, PCHAR Dst)
200 {
201 Convert(Src,Dst,CP_UTF8,CP_ACP);
202 }
203 void Ansi2UTF8(PCHAR Src, PCHAR Dst)
204 {
205 Convert(Src,Dst,CP_ACP, CP_UTF8);
206 }
207
208 #endif
char.h
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1 #include "char.h"
2
3 int main()
4 {
5 File *ansi = initializeFile("ansi.txt", "r");
6 fprintInfo(ansi);
7 fprint(ansi);
8 deconstructFile(ansi);
9
10 File *utf8 = initializeFile("utf-8-bom.txt", "r");
11 fprintInfo(utf8);
12 fprint(utf8);
13 fprintAnsi(utf8);
14 deconstructFile(utf8);
15
16 return 0;
17 }
main.c
![](https://img2018.cnblogs.com/common/1600095/202002/1600095-20200220235702779-306102016.png)
|