pip install pymssql
pip install sqlalchemy
# 不需要修改
import pymssql
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
from DBInfo import server,user,password,db,courseYear
from hashlib import md5
# 数据库连接
connect = pymssql.connect(server,user,password,db,charset='GB18030') #解决乱码问题
if connect:
print("连接成功!")
engine = create_engine('mssql+pymssql://' + user + ':' + password + '@' + server + '/' + db)
cursor = connect.cursor() #创建一个游标对象,python里的sql语句都要通过cursor来执行
# 不需要修改
# 插入数据的方式
from sqlalchemy.types import VARCHAR,DECIMAL
def setdtypedict(df):
dtypedict = {}
for i, j in zip(df.columns, df.dtypes):
if "object" in str(j):
dtypedict.update({i:VARCHAR(256)})
if "float" in str(j):
dtypedict.update({i:DECIMAL(19, 2)})
if "int" in str(j):
dtypedict.update({i:DECIMAL(19)})
return dtypedict
# 仅修改下面代码中的组名即可,然后再执行即可
# 注册一个组名,组名不能重复;注册后也不能删除;请想好之后再注册
groupID = '老师的贝叶斯'
# 注册组名语句,不需要修改
groupDf = pd.DataFrame({'groupID':[groupID],'courseYear':[courseYear]});
d_type=setdtypedict(groupDf)
groupDf.to_sql('groups', engine, index=False, if_exists='append', dtype=d_type, chunksize=10)
# 不需要修改
# 查询组名注册的情况,由于Python3会内部转码到utf-8导致乱码,所以转成md5码到服务器上去比较
obj = md5()
obj.update(groupID.encode("GB2312"))
bs = obj.hexdigest()
sql = "select * from groups where courseYear=" + str(courseYear) + " and sys.fn_varBinToHexStr(HASHBYTES('MD5',groupID))='0x" + bs + "'"
print(pd.read_sql(sql, connect).sort_values(by=['courseYear'],ascending=False))
# 给注册的小组添加成员;
# 注意groupID的个数要和学号个数一致;
# 小组必须已经注册好;学号必须是班里同学的学号,否则无法添加;
groupMemDf = pd.DataFrame({'groupID':[groupID,groupID,groupID],
'stuID':['0000000000','0000000001','0000000002']
});
d_type=setdtypedict(groupMemDf)
groupMemDf.to_sql('groupMembers', engine, index=False, if_exists='append', dtype=d_type, chunksize=10)
# 不需要修改
# 查询添加小组成员信息的情况
sql = "select m.*,s.stuName,s.className from groupMembers m left outer join students s on m.stuID=s.stuID where sys.fn_varBinToHexStr(HASHBYTES('MD5',groupID))='0x" + bs + "';"
print(pd.read_sql(sql, connect).sort_values(by=['stuID'],ascending=True))
# 不修要修改
# 统计数据行数
sql = "select count(1) from hotelReview;"
cursor.execute(sql) #执行sql语句
row = cursor.fetchone() #读取查询结果,
print("hotelReview表的数据共%s行" % (row[0])) #输出结果
sql = "select count(1) from hotelReviewTest;"
cursor.execute(sql) #执行sql语句
row = cursor.fetchone() #读取查询结果,
print("hotelReviewTest表的数据共%s行" % (row[0])) #输出结果
# 不需要修改
# 读取数据到hotelReview数据到dataframe
sql = "select * from hotelReview;"
dfReview = pd.read_sql(sql, connect)
print("\nhotelReview表的基本信息:")
print(dfReview.info())
print("\nhotelReview表的样例数据:")
print(dfReview.head(10))
# 不需要修改
# 读取数据到hotelReviewTest数据到dataframe
sql = "select * from hotelReviewTest;"
dfReviewTest = pd.read_sql(sql, connect)
print("\nhotelReviewTest表的基本信息:")
print(dfReviewTest.info())
print("\nhotelReviewTest表的样例数据:")
print(dfReviewTest.head(10))
# 不需要修改
# 查看用于提交答案的hotelReviewResults表
sql = "select * from hotelReviewResults;"
dfReviewResult = pd.read_sql(sql, connect)
print("\nhotelReviewResults表的基本信息:")
print(dfReviewResult.info())
print("\nhotelReviewResults表的样例数据:")
print(dfReviewResult.head(5))
import csv
# dataframe中的数据存储到本地
# 实验数据不得外传,不得用于除了课程实验外的其它目的
dfReview.to_csv("data\\hotelReview.csv", sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE, index=False)
dfReviewTest.to_csv("data\\hotelReviewTest.csv", sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE, index=False)
import pandas as pd
import numpy
# 读取本地csv文件到dataframe
dfReview = pd.read_csv("data\\hotelReview.csv", sep='\t', encoding='utf-8')
dfReviewTest = pd.read_csv("data\\hotelReviewTest.csv", sep='\t', encoding='utf-8')
display(dfReview.head(),dfReviewTest.head())
# 分词和预处理
import jieba
# 停用词表
# 停用词可能会过滤掉情感词,比如 好 之类的
stoplist = list(pd.read_csv('data/停用词.txt', names = ['w'], sep = 'aaa',
encoding = 'utf-8', engine='python').w)
# 去掉停用词的分词方法
def m_cut(intxt):
return [ w for w in jieba.cut(intxt)
if w not in stoplist and len(w) > 1 ]
# 分词
cuttxt = lambda x: " ".join(m_cut(x))
dfReview["cleantxt"] = dfReview.review.apply(cuttxt)
dfReview.head()
# 构建文档-词频矩阵
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer()
wordmtx = countvec.fit_transform(dfReview.cleantxt)
# print(wordmtx)
wordmtx
# 将数据集划分为 训练集和测试集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(wordmtx, dfReview.label,
test_size = 0.3, random_state = 123)
# 朴素贝叶斯模型
from sklearn import naive_bayes
NBmodel = naive_bayes.MultinomialNB()
NBmodel.fit(x_train, y_train)
NBmodel.predict(x_test) #模型应用
# 预测准确率(给模型打分)
print('训练集:', NBmodel.score(x_train, y_train),
',验证集:', NBmodel.score(x_test, y_test))
# 测试模型的有效性
# 将需要预测的文本转换为和建模时格式完全对应的d2m矩阵格式,随后即可进行预测。
string = "真是个垃圾酒店,又贵又不好"
words = " ".join(m_cut(string))
words_vecs = countvec.transform([words]) # countVec存储了数据转换模式;
words_vecs
print(NBmodel.predict(words_vecs))
# 应用模型开展测试数据的计算
# 分词
dfReviewTest["cleanTxt"] = dfReviewTest.review.astype(str).apply(cuttxt)
# 转文档词频矩阵
test_vecs = countvec.transform(dfReviewTest.cleanTxt)
# 预测结果
dfReviewTest["predict_label"] = NBmodel.predict(test_vecs)
# 查看预测结果
print(dfReviewTest)
# 构造输出结果
# 输出结果的格式:teamID, batchID, ID, label
batch = 101
dfResult = dfReviewTest[['ID','predict_label']]
dfResult.insert(0,'teamID',groupID)
dfResult.insert(1,'batch',batch)
dfResult.rename(columns={'predict_label':'label'},inplace=True)
print(dfResult)
# 不需要修改
# 向数据库上传答案,即上述dfResult
d_type=setdtypedict(dfResult)
dfResult.to_sql('hotelReviewResults', engine, index=False, if_exists='append', dtype=d_type, chunksize=1000)
# 不需要修改
# 查看A排行榜 hotelReviewScoreSheetA是视图
# 最终排行榜是B榜,是全部对象的结果
sql = "select * from hotelReviewScoreSheetA where sys.fn_varBinToHexStr(HASHBYTES('MD5',teamID))='0x" + bs + "';"
print(pd.read_sql(sql, connect).sort_values(by=['F1Score'],ascending=False))
# 不需要修改
# 进一步汇总的结果视图 hotelReviewScoreSheetASummary,供大家快速查看排名情况
sql = "select A.* from hotelReviewScoreSheetASummary A left outer join groups G on A.teamID=G.groupID where courseYear=" + str(courseYear)
print(pd.read_sql(sql, connect).sort_values(by=['F1Score'],ascending=False))
# 不需要修改
# 关闭数据库连接
engine.dispose
cursor.close()
connect.close()
竞赛的组织
打分方式