update: labeling for filter

This commit is contained in:
alikia2x (寒寒) 2025-01-29 05:22:19 +08:00
parent cd9ae4a481
commit b6a5c3dfc4
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
5 changed files with 232 additions and 93 deletions

View File

@ -915,4 +915,19 @@
{"aid": 685634071, "title": "【AI歌手】腐草为萤【Cover】", "description": "原唱:银临\n引擎歌叽歌叽\nPV借用av2700587\nPV星の祈\n绘otakucake阿饼P站pixiv.me/otakucake", "tags": ["AI", "歌叽歌叽"], "author_info": "一支粉笔w: 人生难得一知己,千古知音最难求。 | 网易云:粉笔 微博http://weibo.com/Chalk10", "label": 2}
{"aid": 3372898, "title": "【言和英文】Counting Stars", "description": "自制 终于投了。1.封面仍然灵魂p图。2.这玩意儿9月底就在做了说好10月投pv还解决不了。说好11月投但又因为曲子本身又拖了。上个星期想投结果电脑坏掉加发烧。。。。。所以一路拖到今天才投。算是放下心中的一块石头了吧。3.这次画了很多参数感觉有提升不知道朱军是怎么看的。5.counting stars是我最最最喜欢的一首歌已经循环一年。6.1R赶紧出4专", "tags": ["VOCALOID", "COUNTING STARS", "言和", "ONEREPUBLIC"], "author_info": "JackChenZz: 随缘更新", "label": 0}
{"aid": 589575, "title": "【洛天依】约定", "description": "绫:天依,不管几年之后你能否记得我,我会一直恋你,爱你,铭心记住你! QAQ 被小汐撺掇调的,咱非常不擅长这拐来拐去的长调式滴说,不过这次应该算不上渣吧?(好吧- -无视这句。翻的周慧的《约定》谱子依旧自己来其实只是自己重做MIDI了网上的MIDI做觉得不好用调子太高3分43秒的字幕是UP无聊的XXX。PV用慢放的《春来发几只》弄的剪掉了两组燕子镜头~ ~赶脚两个曲子的BPM同步了", "tags": ["乐正绫", "洛天依", "洛天依翻唱曲", "VOCALOID", "良调教", "约定", "周慧"], "author_info": "星璇の天空: Dr.冥月星璇 成就不会调教的调教师U/V/SV/C)专业咸鱼不会科研的副研究员不会编程的系统架构师不会AI的无人机设计师...", "label": 2}
{"aid": 411693256, "title": "【言和V5】独二无一【VOCALOID Cover】", "description": "人声本家https://www.bilibili.com/video/BV1oJ411u7Xq?t=222.3\nVC本家https://www.bilibili.com/video/BV12J411u7Aa?t=2.8", "tags": ["Vsinger创作激励计划", "VOCALOID CHINA", "VOCALOID", "言和", "虚拟歌手", "独二无一", "Vsinger创作激励计划第六期"], "author_info": "远宇V: 言和vsinger\njiafei同人扣扣群:121249682", "label": 0}
{"aid": 411693256, "title": "【言和V5】独二无一【VOCALOID Cover】", "description": "人声本家https://www.bilibili.com/video/BV1oJ411u7Xq?t=222.3\nVC本家https://www.bilibili.com/video/BV12J411u7Aa?t=2.8", "tags": ["Vsinger创作激励计划", "VOCALOID CHINA", "VOCALOID", "言和", "虚拟歌手", "独二无一", "Vsinger创作激励计划第六期"], "author_info": "远宇V: 言和vsinger\njiafei同人扣扣群:121249682", "label": 0}
{"aid": 113244177962166, "title": "【スーサイドSuicide× 头痛欲绝】头痛到自 杀了呢", "description": "-", "tags": ["重音テト", "頭ン痛", "头痛欲绝", "スーサイド!", "Suicide", "teto"], "author_info": "硕果-累累: 该up主视频可能有长时间断更不可催更因为要上课不足之处欢迎提出意见", "label": 0}
{"aid": 791366733, "title": "menfnfn", "description": "-", "tags": ["让虚拟歌手为你唱", "虚拟之声创作计划·2023第四期"], "author_info": "bili_70413135045: ", "label": 0}
{"aid": 233901881, "title": "《父亲的草原母亲的河》-降D调-钢琴伴奏-正谱伴奏-动态谱", "description": "声乐作品钢琴正谱伴奏视频,音频纯钢琴录制,视频动态走带条手工调校,让声乐教学更高效,练歌更方便。\n学生学歌神器老师教学助手\n说明降调伴奏仅为音频降调曲谱仍为原调需要原调伴奏可进入个人空间搜索……", "tags": ["虚拟歌手分享官", "教学", "音乐", "正谱伴奏", "和伴奏", "虚拟之声创作计划·2023第三期", "钢琴伴奏", "培训", "歌曲伴奏", "声乐伴奏", "必剪创作"], "author_info": "音乐视频玩家: 来一首说唱就唱的歌曲吧,音乐、歌词都准备好了! \n\n#…分享各类艺术歌曲及伴奏…#", "label": 0}
{"aid": 91991603, "title": "【洛天依】洛天依 - 盛开季节", "description": "作品类型: 原创曲\n调教: 阿宽\n作词: 阿宽\n作曲: 阿宽\n编曲/混音: 阿宽\n简介补充: 创作于10.03.10新编于20.02.26。。。我现在是多无聊。。。。_(:з」∠)_", "tags": ["VOCALOID", "VOCALOIDCHINA", "原创歌曲", "原创曲", "自制", "洛天依原创曲", "洛天依"], "author_info": "arqalm: 我的网易音乐人主页https://music.163.com/#/artist?id=12126188", "label": 1}
{"aid": 1406015081, "title": "「 AI 孙燕姿 」《请你一定要记得我好吗》cover 房东的猫", "description": "免责声明:\n本作品仅作为娱乐目的发布可能造成的后果与使用的音声转换项目的作者、贡献者无关\n本视频使用AI音色替换技术(Sovits4.0),视频中演唱人声并非孙燕姿本人。\n如有侵权告知删除\n原唱请你一定要记得我好吗-房东的猫\nAI声源孙燕姿", "tags": ["AI", "孙燕姿", "COVER", "翻唱", "房东的猫"], "author_info": "无语猫猫没有鱼: 要好好吃饭哦", "label": 2}
{"aid": 236266867, "title": "在一起真好|一緒がよかった / 重音テトSV【SynthV オリジナル】", "description": "https://youtu.be/8H8P_w4OAmo\nUPHaqua - はふれつ\n一緒がよかった / 重音テトSV【SynthV オリジナル】\n11月22日\n2023年\n関係ないけどT1優勝おめでとう", "tags": ["携手虚拟歌手征服世界吧!", "虚拟歌手", "重音テト", "SynthV"], "author_info": "音街ウナ: AI觉醒切蒲英\n ——2021.2.9\n是音街鳗切蒲英双推人也看冷门歌姬和VUP\n有想让我搬运的油管视频欢迎来推荐哦", "label": 0}
{"aid": 212218392, "title": "【鏡音レン】嫉妬[short ver]【yoh】", "description": "sm40137018\n投稿日時 2022/03/06 18:00\n嫉妬\n\nvo : 鏡音レン\n\nillust : 조둔이 \nhttps://twitter.com/JODOONI\n\nmusic & video : yoh\nhttps://twitter.com/yoh_hidamari\n\noff vo→https://www.dropbox.com/s/wbvl5gew4oxb5u4/%E5%AB%89%E5%A6%AC%20off.mp3?dl=0", "tags": ["VOCALOID", "鏡音レン", "yoh", "嫉妬"], "author_info": "路过的某咸鱼: ", "label": 0}
{"aid": 18174891, "title": "【乐正龙牙&洛天依】凉凉(考试版) 献给挂科或者即将挂科的小伙伴们", "description": "喜欢的话就推荐、投币、收藏、订阅素质4连吧~~\r\n翻调曲\r\npv:DS绝对小贱\r\n重作词:@凸凸你034\r\n英文翻译:@凸凸你034\r\n调教:DS绝对小贱\r\n新浪微博http://weibo.com/dsxiaojian2000\n\r\n我优酷空间地址http://i.youku.com/i/UMTIwNjYwNTQwOA==", "tags": ["翻译大家", "洛天依", "翻唱", "乐正龙牙", "VOCALOID中文曲", "VOCALOID", "合唱", "开口跪", "凉凉", "重作词"], "author_info": "DS绝对小贱: 鬼畜区的宝藏up 中国版《我的世界》DoMCer服务器 Admin", "label": 2}
{"aid": 1505358010, "title": "【洛天依V4J】熱異常【VOCALOIDカバー】", "description": "■重新调了一下…上次听的怪怪的是因为加了自然音效,这次对整体都做了一定的修正和更改\n\n⚠☞但是声音还是有点小耳机党请增大音量🔊\n-----------------------------------\n■本家https://youtu.be/b2NTglk9tvI?si=XHhYTbeMtgNWNYG6\n\n☞参考的视频以及ust转vsqx\n■https://bowlroll.net/file/286986\n■https://b23.tv/9uN5ER1\n-----------------------------------\n■调音混音我\n■编辑器VOCALOID6", "tags": ["VOCALOID", "V4", "洛天依", "日语翻唱", "熱異常", "いよわP", "VOCALOIDカバー"], "author_info": "伊従蒼: ", "label": 0}
{"aid": 383098671, "title": "【雀河&长歌】《知否知否》(荼鸢:什么时候轮到我? 我:下次一定!)", "description": "原唱:胡夏/郁可唯\n翻唱雀河/长歌\n调教本人\n本来想用荼鸢可是长歌实在太适合了……", "tags": ["让虚拟歌手为你唱", "长歌", "ACE虚拟歌姬", "雀河", "中文VOCALOID", "知否知否"], "author_info": "松下_下上上左右AB: Jam project YYDS", "label": 2}
{"aid": 4983329, "title": "【洛天依原创】九歌·东皇太一", "description": "自制 1.灵感来了无法阻挡灵感欲去转瞬即逝啊朋友们。→_→ 2.对等有空了就不再做这么敷衍的pv了→_→。", "tags": ["中国风", "古风", "VOCALOID", "VOCALOID中文曲", "洛天依", "屈原"], "author_info": "十六air: 你所热爱的,就是说的道理。", "label": 1}
{"aid": 580592330, "title": "【言和v3】不安灵魂收容所short ver.【60分调声】", "description": "不安言和收容所!\n翻唱言和\n调团子\n混ngf\nvsqx719\n去年怎么已经是去年了60分调声的存稿混个新年曲了\n很粗糙的调以及跨语种并且为了能按时调完还把和声轨删了…………随便听听得了\n\n\n\n原曲BV1Nu411t7Pj\n「不安灵魂收容所」\n词曲、制作人ChiliChill\n人声录音室RisingToneStudio\n曲绘kurattes\nPV制作缘聚世代", "tags": ["Vsinger创作激励计划", "VOCALOID CHINA", "言和", "COVER", "ChiliChill", "中文VOCALOID", "Vsinger创作激励计划第六期"], "author_info": "团子_P: 不在别找不知道", "label": 2}
{"aid": 113725465957955, "title": "【鏡音リン17th】花海【中日双语翻唱】", "description": "原唱:周杰伦\n翻唱鏡音リン act1 (日语部分)\n 鏡音リン power/warm (中文部分)\n背景Project SEKAI 镜音铃17周年生贺卡面\n 自摄\nリンちゃん生日快乐\n三次元繁忙赶制的生贺翻唱混音依旧一坨。\nC社铃的新声库什么时候出", "tags": ["花海", "COVER", "镜音铃", "鏡音リン", "翻唱", "VOCALOID"], "author_info": "纯情好少年: 词不达意,表达模糊,语文不及格工科狗", "label": 2}
{"aid": 66426751, "title": "【洛天依V4J】首班车和卡夫卡【COVER】", "description": "Music·Lyricsナブナsm27057005/av2840917\nVocal: 洛天依\nManipulation多元P\nvsqxヤ-クザPav38709954\nLast Workav64325315\n\n小伙伴说了只要想吃冰无论何时都是夏天xxxxxx", "tags": ["VOCALOID", "ナブナ", "洛天依日语", "洛天依", "始発とカフカ", "首班车和卡夫卡", "n-buna"], "author_info": "多元P: 日语依不稳定产粮ing", "label": 0}
{"aid": 4121667, "title": "【洛天依】出狱新单《哈密瓜》", "description": "自制 视频:乔木 词曲编混唱:舟扒皮\r\n\r\n终于放出来了哈密瓜好好吃哦。", "tags": ["原创曲", "洛天依", "VOCALOID", "舟扒皮"], "author_info": "舟皮: ", "label": 1}
{"aid": 58775982, "title": "【洛天依】祖龙吟", "description": "以下是原曲staff\n总策划木宁木蒙\n企划运营塔库\n监制落落无尘、卿雅、小仙、顾雪柔\n作曲陈亦洺 | 作词:玄天 | 编曲Mzf小慕\n演唱星尘 | 调教:花儿不哭 | 混音Mr.曾经 | 笛子:囚牛\n视频小约酱【麻薯映像】| 曲绘:白鄔東\n朕之功过当世无可评说者百代之后自有论断。\nvsqx桂月 & 星葵 & 熊\n原曲av42348447\n天依生日快乐\n我永远喜欢洛天依", "tags": ["VOCALOIDCHINA", "洛天依", "中国风", "古风", "2019洛天依生日会"], "author_info": "洛水清平: 取次花丛懒回顾,半缘修道半缘君。 ", "label": 2}

View File

@ -101,3 +101,17 @@
{"aid": 56016654, "title": "【重音テト】あっけない【./わさび】", "tags": ["公主殿下"], "author_info": "neneneneneneneko: ", "description": "sm35282409\nボカロ23曲目。ウタウ。\n\nボカロ曲\nmylist/57784222\nインスト曲\nmylist/53204966\ntwitter\nhttps://twitter.com/wasa_giga", "model": 0, "human": 0}
{"aid": 579224517, "title": "∧…在", "tags": ["让虚拟歌手为你唱", "虚拟之声创作计划·2023第四期"], "author_info": "时序夜: ", "description": "-", "model": 0, "human": 0}
{"aid": 1351556126, "title": "フルグア GalGame姬恋~缚羽的欠片OP -- 心之芽 vocal by 初音未来", "tags": ["OP", "初音未来", "ACG音乐", "主题曲", "姬恋缚羽的欠片"], "author_info": "Chinbam_: 离开中...", "description": "GalGame姬恋~缚羽的欠片OP\n玩GAlGAME发现主题曲挺好听看站内貌似没有就投一下自己听 hhh\n作词·作曲·编曲フルグア\n视频灼弦&Afezeria&沛之暗影\n能不能过审也不知道 hh\n---侵删", "model": 0, "human": 0}
{"aid": 68443207, "title": "レジデュアム _ 鏡音リン - ニコニコ動画", "tags": ["镜音RIN"], "author_info": "neneneneneneneko: ", "description": "sm18807181", "model": 0, "human": 0}
{"aid": 1847414, "title": "【初音ミク】 光谱 【Falconnect】", "tags": ["VOCALOID", "初投稿", "初音MIKU", "字幕付", "ハヤブサ", "FALCONNECT", "キミスペクトル"], "author_info": "糜酱: 〓如视频侵权请私信我删〓\r\n\r\n 视频↙可调原画。 下载自助 http://www.bilibilijj.com/oldindex\r\n\r\n「何も知らずに勝手に誤解されるのはごめんだ。」\r\n", "description": "sm25250566 标题求翻译; \r\n【Falconnect】\r\n■music :ハヤブサ\r\n■illust :桜木蓮\r\n■movie :まつらい\r\nカラオケはこちら。https://www.dropbox.com/home/Vocal%20off%20Track", "model": 0, "human": 0}
{"aid": 430054078, "title": "【オリジナル曲】孤独回廊 イナガシ feat.初音ミク【ニコニコ転載】", "tags": ["转载", "日语", "MIKU", "NICONICO", "初音MIKU", "初音", "公主殿下", "初音未來"], "author_info": "定電流: ^%t&#a", "description": "https://www.nicovideo.jp/watch/sm40996727", "model": 0, "human": 0}
{"aid": 977788326, "title": "【东方栀子Era】最美的期待【VocalSharp COVER】", "tags": ["东方栀子", "VocalSharp", "东方栀子十周年投稿祭"], "author_info": "口古口古口古郁: ", "description": "原版:周笔畅《最美的期待》\n翻唱东方栀子Era\n\nVS巨好用摸鱼飞快\n早上起来突然想摸要不是出去走亲戚中午就能投了焯", "model": 2, "human": 2}
{"aid": 295971472, "title": "一梦千宵【乐正绫cover】", "tags": ["必剪创作", "2022Vsinger创作激励计划第一期"], "author_info": "ViVi141: ", "description": "较上个视频优化了一下混音", "model": 2, "human": 2}
{"aid": 630517376, "title": "【ニコカラ】パラジクロロベンゼン(对二氯苯)(off vocal)【鏡音レン】", "tags": ["VOCALOID", "对二氯苯", "镜音连", "パラジクロロベンゼン", "オワタP", "やさ", "完蛋P", "nicokara"], "author_info": "诳原糸: なぜ日は傾くのか", "description": "sm8346614\n自身の持ち込み用に作成させて頂きました。\n\nお借りした原曲様→sm8269164\n作ってみたリスト→mylist/9495351\n\n【10/6追記】\non vocal版作成させて頂きました→sm8438926\n——やさ", "model": 0, "human": 0}
{"aid": 34280265, "title": "夜景", "tags": ["デフォ子", "鷹々音タカユキ"], "author_info": "neneneneneneneko: ", "description": "sm34052035\n参加音源 デフォ子\n「ナカ番」さんの感じ出したかったけど微妙かなー \n今回 歌詞表示してません\nあと イントロ長いです\n\nツイッター@pz0Z2IxidBJbNyk", "model": 0, "human": 0}
{"aid": 1353871111, "title": "可不也入飞门了 《最佳顺友》【CeVIO AI 粤语cover】", "tags": ["搞笑", "英雄联盟", "飞门", "CeVIO AI", "飞马", "张顺飞", "可不", "原神"], "author_info": "事隠しのfubuki: ", "description": "原作者 :原家门-杰顺克 本家BV1sq421P7bC\n感谢老哥授权二创可惜可不把老哥的奥德腮皮肤遮住了\n\n混/调:我\n\n可不图作者推特https://x.com/noizman_sympho\n\nmocha pro 总是跟踪失败,用剪映摆了,这几天脑子里全是这歌,太顶级了", "model": 2, "human": 2}
{"aid": 248109170, "title": "传说之下游戏Treo的尘埃的信任一段开头演示官方重置 Dusttrust phase 1 thingie", "tags": ["BGM", "AU", "传说之下", "UT", "UNDERTALEAU"], "author_info": "三尘-深渊: 画渣无小号只在B站发视频和动态", "description": "该视频作者 Treo 3340位订阅者\n原视频链接https://m.youtube.com/watch?v=1F3EpHU1vnI", "model": 0, "human": 0}
{"aid": 113233474097330, "title": "【永夜Minus】Bleeding Love", "tags": ["让虚拟歌手为你唱", "虚拟歌姬", "虚拟歌手", "音乐", "VOCALOID", "Synthesizer V", "永夜Minus", "虚拟之声创作计划·2024第三期", "翻调", "Leona Lewis", "Bleeding Love"], "author_info": "五等分的monop: 菜菜调校", "description": "原曲Bleeding Love\n原唱Leona Lewis\n-----------------------------------------------------\n-----------------------------------------------------\n调校monop\n混音monop", "model": 2, "human": 0}
{"aid": 115025, "title": "【初音ミク】Re:birth ☆实长: 3:19", "description": "sm15005742 ☆实际长度: 3:19 ஐMusic&Lyricsキョーヘイ ஐIllustLiLLa 久し振りに病んだ曲になってますw.こんな曲は作ってて楽しいですねww Re:birth?重生?似乎可以这样理解吧?ˋ ( ° ▽、° )", "tags": ["初音MIKU", "VOCALOID", "キョーヘイ", "VOCALOID新曲リンク"], "author_info": "雪暴公主: Forever Alone", "url": "https://www.bilibili.com/video/BV1Qx411c7GE", "model_label": 0, "user_label": null, "human": 0}
{"aid": 112657831036502, "title": "冰火歌会 2", "description": "-", "tags": ["冰火歌会", "必剪创作", "虚拟 UP 主", "虚拟最强音·2024第三期"], "author_info": "LV驴子: ", "url": "https://www.bilibili.com/video/BV1AegbezEC8", "model_label": 0, "user_label": null, "human": 0}
{"aid": 732450889, "title": "【洛天依AI】四季予你【ACE Studio COVER】", "description": "四季予你\n作词烟十八\n作曲郑冰冰\n编曲刘圣华华子\n原唱程响\n翻唱洛天依\n调教夏龙杰lndulge\n求大佬评论指导", "tags": ["让虚拟歌手为你唱", "VOCALOID CHINA", "COVER", "洛天依", "虚拟歌手创意音乐大赏"], "author_info": "夏龙杰lndulge: 又名怠惰p高中生约稿B站私信可约调教和混音混音最好别约不求关注但求三连加评论 (能关注一下也可以)", "url": "https://www.bilibili.com/video/BV1VD4y1x7c1", "model_label": 2, "user_label": null, "human": 2}
{"aid": 310042157, "title": "【心华翻唱】祖龙吟【忘川风华录】", "description": "总策划:木宁木蒙\n企划运营塔库\n监制落落无尘、卿雅、小仙、顾雪柔\n作曲陈亦洺 | 作词:玄天 | 编曲Mzf小慕\n演唱星尘 | 调教:花儿不哭 | 混音Mr.曾经 | 笛子:囚牛\n视频小约酱【麻薯映像】| 曲绘:白鄔東\n朕之功过当世无可评说者百代之后自有论断。\n\n原作指路;BV11b411k7ym\n\n感谢忘川的公开二创素材调了一个半月了给孩子一个三连吧求求了。。。", "tags": ["虚拟之声创作计划", "COVER", "心华", "翻唱", "VOCALOID", "VOCALOID CHINA", "嬴政", "忘川风华录", "祖龙吟", "虚拟之声创作计划·2023第一期"], "author_info": "幽影丶鎺: 一只鸽子精,约稿事宜请私信哦~咕咕咕", "url": "https://www.bilibili.com/video/BV1UA411172s", "model_label": 2, "user_label": null, "human": 2}

33
filter/db_utils.py Normal file
View File

@ -0,0 +1,33 @@
import sqlite3
import json
def fetch_entry_data(db_path, aid):
    """
    Load the raw ``data`` column for one video from the crawl database.

    Args:
        db_path: Path of the SQLite database file.
        aid: Video id matched against ``bili_info_crawl.aid``.

    Returns:
        The ``data`` value of the first matching row, or ``None`` when no
        row has this ``aid``.

    Raises:
        sqlite3.Error: If the query itself fails (for example the table is
            missing). The connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT data FROM bili_info_crawl WHERE aid = ?", (aid,)
        ).fetchone()
    finally:
        # Close on every path so a failing query does not leak the handle.
        conn.close()
    return row[0] if row else None
def parse_entry_data(data):
    """
    Parse one crawled JSON record into the fields used for labeling.

    Args:
        data: JSON text containing ``View`` (``title``, ``desc``, ``bvid``),
            ``Tags`` (items with ``tag_name`` / ``tag_type``) and
            ``Card.card`` (``name``, ``sign``).

    Returns:
        A tuple ``(title, description, tags, author_info, url)``. Only tags
        whose ``tag_type`` is ``old_channel`` or ``topic`` are kept. If the
        input is not valid JSON, is ``None``, or lacks an expected key, the
        error is printed and a tuple of five ``None`` values is returned.
    """
    try:
        obj = json.loads(data)
        view = obj["View"]
        card = obj["Card"]["card"]
        title = view["title"]
        description = view["desc"]
        tags = [
            tag["tag_name"]
            for tag in obj["Tags"]
            if tag["tag_type"] in ("old_channel", "topic")
        ]
        author_info = card["name"] + ": " + card["sign"]
        url = "https://www.bilibili.com/video/" + view["bvid"]
    except (TypeError, KeyError, json.JSONDecodeError) as e:
        # KeyError is included so an incomplete record takes the same
        # "unparseable" path as malformed JSON instead of crashing the caller.
        print(f"Error parsing data: {e}")
        return None, None, None, None, None
    return title, description, tags, author_info, url

View File

@ -6,6 +6,7 @@ import sys
import tty
import termios
import argparse
from db_utils import fetch_entry_data, parse_entry_data
# 数据库路径
DATABASE_PATH = "./data/main.db"
@ -45,42 +46,12 @@ def fetch_random_aids(db_path, num_entries=10, start_from=None):
conn.close()
return aids
def fetch_entry_data(db_path, aid):
    """
    Load the raw ``data`` column for one video from the crawl database.

    Args:
        db_path: Path of the SQLite database file.
        aid: Video id matched against ``bili_info_crawl.aid``.

    Returns:
        The ``data`` value of the first matching row, or ``None`` when no
        row has this ``aid``.

    Raises:
        sqlite3.Error: If the query itself fails (for example the table is
            missing). The connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT data FROM bili_info_crawl WHERE aid = ?", (aid,)
        ).fetchone()
    finally:
        # Close on every path so a failing query does not leak the handle.
        conn.close()
    return row[0] if row else None
def parse_entry_data(data):
    """
    Parse one crawled JSON record into the fields used for labeling.

    Args:
        data: JSON text containing ``View`` (``title``, ``desc``, ``bvid``),
            ``Tags`` (items with ``tag_name`` / ``tag_type``) and
            ``Card.card`` (``name``, ``sign``).

    Returns:
        A tuple ``(title, description, tags, author_info, url)``. Only tags
        whose ``tag_type`` is ``old_channel`` or ``topic`` are kept. If the
        input is not valid JSON, is ``None``, or lacks an expected key, the
        error is printed and a tuple of five ``None`` values is returned.
    """
    try:
        obj = json.loads(data)
        view = obj["View"]
        card = obj["Card"]["card"]
        title = view["title"]
        description = view["desc"]
        tags = [
            tag["tag_name"]
            for tag in obj["Tags"]
            if tag["tag_type"] in ("old_channel", "topic")
        ]
        author_info = card["name"] + ": " + card["sign"]
        url = "https://www.bilibili.com/video/" + view["bvid"]
    except (TypeError, KeyError, json.JSONDecodeError) as e:
        # KeyError is included so an incomplete record takes the same
        # "unparseable" path as malformed JSON instead of crashing the caller.
        print(f"Error parsing data: {e}")
        return None, None, None, None, None
    return title, description, tags, author_info, url
def label_entries(db_path, aids):
"""
标注工具展示条目信息等待用户输入标签
"""
labeled_data = []
label_counts = {0: 0, 1: 0, 2: 0}
for aid in aids:
data = fetch_entry_data(db_path, aid)
title, description, tags, author_info, url = parse_entry_data(data)
@ -88,6 +59,7 @@ def label_entries(db_path, aids):
continue
# 展示信息
os.system("clear")
print(f"Count: {label_counts[0]}, {label_counts[1]}, {label_counts[2]}")
print(f"AID: {aid}")
print(f"URL: {url}")
print(f"Title: {title}")
@ -105,6 +77,8 @@ def label_entries(db_path, aids):
continue
if label == "q": # 退出
break
# 更新计数
label_counts[int(label)] += 1
# 保存标注结果
labeled_data.append({
"aid": aid,

View File

@ -1,70 +1,173 @@
import os, json
import os
import json
import random
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"]="1"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch
from modelV3_4 import VideoClassifierV3_4
from sentence_transformers import SentenceTransformer
from tag import getch
import sys
import tty
import termios
from db_utils import fetch_entry_data, parse_entry_data
def predict(json_input):
# 加载模型
model = VideoClassifierV3_4()
model.load_state_dict(torch.load('./filter/checkpoints/best_model_V3.8.pt'))
model.eval()
DATABASE_PATH = "./data/main.db"
BATCH_SIZE = 50 # 动态加载批次大小
# 加载SentenceTransformer
sentence_transformer = SentenceTransformer("Thaweewat/jina-embedding-v3-m2v-1024")
class LabelingSystem:
def __init__(self):
# 初始化模型
self.model = VideoClassifierV3_4()
self.model.load_state_dict(torch.load('./filter/checkpoints/best_model_V3.8.pt'))
self.model.eval()
self.sentence_transformer = SentenceTransformer("Thaweewat/jina-embedding-v3-m2v-1024")
input_texts = {
"title": [json_input["title"]],
"description": [json_input["description"]],
"tags": [" ".join(json_input["tags"])],
"author_info": [json_input["author_info"]]
# 数据相关
self.existing_entries = self._load_existing_entries()
self.existing_aids = set(entry['aid'] for entry in self.existing_entries)
self.candidate_pool = []
self.history = []
self.current_index = -1 # -1表示未开始
# 初始化第一批数据
self._load_more_candidates()
def _save_entry(self, entry):
    """
    Insert or replace a labeled entry and rewrite the results file.

    An entry in ``self.existing_entries`` with the same ``aid`` is replaced
    in place; otherwise ``entry`` is appended. The whole list is then
    written to ``./data/filter/real_test.jsonl``, one JSON object per line.

    Args:
        entry: Dict with at least an ``aid`` key.
    """
    aid = entry['aid']
    for position, saved in enumerate(self.existing_entries):
        if saved['aid'] == aid:
            self.existing_entries[position] = entry
            break
    else:
        self.existing_entries.append(entry)

    # Keep the id set in sync with the list: the candidate loader filters on
    # existing_aids, so a stale set would re-offer items that were just saved.
    self.existing_aids.add(aid)

    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding instead
    # of depending on the locale default.
    with open("./data/filter/real_test.jsonl", "w", encoding="utf-8") as fp:
        fp.writelines(
            json.dumps(item, ensure_ascii=False) + "\n"
            for item in self.existing_entries
        )
def _load_existing_entries(self):
    """
    Load previously saved entries from ``./data/filter/real_test.jsonl``.

    Returns:
        A list with one parsed JSON object per non-blank line, or an empty
        list when the file does not exist.
    """
    try:
        with open("./data/filter/real_test.jsonl", "r", encoding="utf-8") as fp:
            # Blank lines (e.g. a stray trailing newline) carry no record and
            # would otherwise make json.loads raise.
            return [json.loads(line) for line in fp if line.strip()]
    except FileNotFoundError:
        return []
def _load_more_candidates(self):
    """
    Refill ``self.candidate_pool`` with up to ``BATCH_SIZE`` unseen aids.

    Reads ``data/filter/model_predicted.jsonl`` (one JSON object with an
    ``aid`` key per line), drops every aid that is already saved, already
    shown in this session, or already queued, shuffles the remainder and
    appends the first ``BATCH_SIZE`` of them to the pool. If nothing new is
    left the pool is not modified.
    """
    # Besides saved entries, exclude what this session has already shown or
    # queued; otherwise a reload could hand the same video out twice.
    excluded = set(self.existing_aids)
    excluded.update(shown['aid'] for shown in self.history)
    excluded.update(self.candidate_pool)

    fresh = []
    with open('data/filter/model_predicted.jsonl', 'r', encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                continue
            aid = json.loads(line)['aid']
            if aid not in excluded:
                excluded.add(aid)  # also collapses duplicate lines in the file
                fresh.append(aid)

    random.shuffle(fresh)
    self.candidate_pool.extend(fresh[:BATCH_SIZE])
def _get_entry_details(self, aid):
    """
    Build the display record for one video, including the model's label.

    The raw record is fetched from ``DATABASE_PATH`` and parsed, the text
    fields are passed to ``self.model`` together with
    ``self.sentence_transformer``, and the argmax over dimension 1 of the
    returned logits becomes ``model_label``.

    Args:
        aid: Video id to look up.

    Returns:
        Dict with keys ``aid``, ``title``, ``description``, ``tags``,
        ``author_info``, ``url``, ``model_label`` and ``user_label`` (the
        last one always ``None`` here).
    """
    raw_record = fetch_entry_data(DATABASE_PATH, aid)
    title, description, tags, author_info, url = parse_entry_data(raw_record)

    # NOTE(review): parse_entry_data yields None fields on failure, in which
    # case the join below raises TypeError — confirm callers expect that.
    model_inputs = {
        "title": [title],
        "description": [description],
        "tags": [" ".join(tags)],
        "author_info": [author_info],
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = self.model(
            input_texts=model_inputs,
            sentence_transformer=self.sentence_transformer,
        )
    predicted = torch.argmax(logits, dim=1).item()

    details = {
        'aid': aid,
        'title': title,
        'description': description,
        'tags': tags,
        'author_info': author_info,
        'url': url,
        'model_label': predicted,
        'user_label': None,
    }
    return details
# 预测
with torch.no_grad():
logits = model(
input_texts=input_texts,
sentence_transformer=sentence_transformer
)
pred = torch.argmax(logits, dim=1).item()
def _display_entry(self, entry):
    """
    Clear the terminal and print one entry for review.

    Prints the aid, URL, title, comma-joined tags, author info, description
    and the model prediction, followed by the human label when the entry
    already has one.

    Args:
        entry: Dict with keys ``aid``, ``url``, ``title``, ``tags``,
            ``author_info``, ``description``, ``model_label`` and
            ``user_label``; ``human`` is optional.
    """
    os.system("clear")
    print(f"AID: {entry['aid']}")
    print(f"URL: {entry['url']}")
    print(f"Title: {entry['title']}")
    print(f"Tags: {', '.join(entry['tags'])}")
    print(f"Author Info: {entry['author_info']}")
    print(f"Description: {entry['description']}")
    print(f"\nModel Prediction: {entry['model_label']}")

    # run() stores the chosen label under 'human', while 'user_label' is
    # created as None and not updated there, so checking only 'user_label'
    # would never show a label when navigating back to a labeled entry.
    chosen = entry.get('human')
    if chosen is None:
        chosen = entry['user_label']
    if chosen is not None:
        print(f"Your Label: {chosen}")
return pred
def run(self):
    """
    Run the interactive labeling loop until the user quits or data runs out.

    Each iteration shows the entry at ``self.current_index``, first pulling
    a new candidate into ``self.history`` when the index is past its end,
    and then reads one key:

    * ``0`` / ``1`` / ``2``: store the label under ``human``, save, advance.
    * ``s``: advance without labeling.
    * left / up arrow: go back one entry (not below the first).
    * right / down arrow: go forward one entry.
    * ``q``, Ctrl-C, Ctrl-D or end of input: leave the loop.

    Any other key simply redisplays the current entry.
    """
    while True:
        if self.current_index < 0:
            self.current_index = 0

        # Past the end of the history: bring in the next unseen candidate.
        if self.current_index >= len(self.history):
            if not self.candidate_pool:
                self._load_more_candidates()
            if not self.candidate_pool:
                print("\nAll entries processed!")
                return
            aid = self.candidate_pool.pop(0)
            self.history.append(self._get_entry_details(aid))
            self.current_index = len(self.history) - 1

        current_entry = self.history[self.current_index]
        self._display_entry(current_entry)

        print("\nLabel (0/1/2, s=skip, ←↑/→↓=nav, q=quit): ", end="", flush=True)
        cmd = getch().lower()

        if cmd in ('left', 'up'):
            self.current_index = max(0, self.current_index - 1)
        elif cmd in ('right', 'down'):
            self.current_index += 1
        elif cmd in ('0', '1', '2'):
            current_entry['human'] = int(cmd)
            self._save_entry(current_entry)
            self.current_index += 1  # move on automatically after labeling
        elif cmd == 's':
            self.current_index += 1  # skip without labeling
        elif cmd in ('q', '\x03', '\x04', ''):
            # The key is read in raw mode, where Ctrl-C / Ctrl-D arrive as
            # plain characters instead of raising KeyboardInterrupt / EOF,
            # and an exhausted stdin yields ''. Treat all of them as quit so
            # the loop can always be left and never spins on end of input.
            return
def getch():
    """
    Read a single keypress from stdin with the terminal in raw mode.

    Returns:
        The character read, or for an escape sequence the name of the arrow
        key (``'up'``, ``'down'``, ``'right'``, ``'left'``) matching the two
        characters after ESC; ``'unknown'`` for any other sequence. The
        previous terminal settings are restored before returning.
    """
    arrow_names = {'[A': 'up', '[B': 'down', '[C': 'right', '[D': 'left'}
    fd = sys.stdin.fileno()
    saved_attrs = termios.tcgetattr(fd)
    try:
        tty.setraw(fd)
        first = sys.stdin.read(1)
        if first != '\x1b':
            return first
        # ESC introduces a sequence: the next two characters identify the key.
        return arrow_names.get(sys.stdin.read(2), 'unknown')
    finally:
        # Restore the terminal even if reading fails.
        termios.tcsetattr(fd, termios.TCSADRAIN, saved_attrs)
if __name__ == "__main__":
with open('data/filter/model_predicted.jsonl', 'r') as fp:
data = [json.loads(line) for line in fp.readlines()]
sampled = random.sample(data, min(200, len(data)))
test_data = []
for sample in sampled:
label = sample['label']
os.system("clear")
print(f"AID: {sample['aid']}")
print(f"Title: {sample['title']}")
print(f"Tags: {', '.join(sample['tags'])}")
print(f"Author Info: {sample['author_info']}")
print(f"Description: {sample['description']}")
# 等待用户输入
while True:
print("Label (0 or 1 or 2, s to skip, q to quit): ", end="", flush=True)
real_label = getch().lower()
if real_label in ["0", "1", "2", "s", "q"]:
break
print("\nInvalid input. Please enter 0, 1, 2, s or q.")
if real_label == "s": # 跳过
continue
if real_label == "q": # 退出
break
test_data.append({
"aid": sample['aid'],
"title": sample['title'],
"tags": sample['tags'],
"author_info": sample['author_info'],
"description": sample['description'],
"model": label,
"human": int(real_label)
})
with open("./data/filter/real_test.jsonl", "a") as fp:
fp.writelines([json.dumps(item, ensure_ascii=False) + "\n" for item in test_data])
labeling_system = LabelingSystem()
labeling_system.run()