This notebook demonstrates how I visualize binary features & clustering
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
def is_std_perm(p):
    """Return True if *p* starts with 'android.permission.' or 'com.android.'."""
    return p.startswith(('android.permission.', 'com.android.'))
def elide(s, l=20):
    """Shorten an APK file name so it fits as a compact axis label.

    The last four characters (assumed to be the ``.apk`` extension) are
    dropped, then everything from the first underscore onwards (the
    ``_<vercode>`` suffix) is removed.  If the remainder is longer than
    *l* characters, its middle is replaced by ``..``.

    Args:
        s: File name such as ``com.example.app_12_1.apk``.
        l: Longest name that is returned unchanged; longer names keep
            ``l // 2`` characters from each end around the ``..`` marker.

    Returns:
        The shortened name.
    """
    s = s[:-4]  # remove extension
    # Cut at the first underscore.  partition() leaves the name intact when
    # there is no underscore, whereas s[:s.find('_')] would slice with -1 and
    # silently chop off the final character.
    s = s.partition('_')[0]  # remove _<vercode>
    if len(s) <= l:
        return s
    # Floor division keeps the slice bounds integral on Python 3 as well.
    h = l // 2
    if h == 0:
        # s[-0:] is the whole string, so handle the degenerate width explicitly.
        return '..'
    return s[:h] + '..' + s[-h:]
# Load the crawled APK table; missing cells are treated as "feature absent" (0).
dataset = pd.read_csv('crawled-apks.csv')
dataset.fillna(0, inplace=True)
# remove some samples
# A random half of the rows is dropped; no seed is set in the visible code,
# so the surviving rows differ from run to run.
dataset.drop(np.random.choice(dataset.index, int(len(dataset) * 0.5), replace=False), inplace=True)
dataset.reset_index(drop=True, inplace=True)
# remove columns which are completely all zero
# (.loc with a boolean column mask replaces the removed DataFrame.ix indexer)
dataset = dataset.loc[:, (dataset != 0).any(axis=0)]
# remove columns with non-standard permissions
# Only 'p_'-prefixed columns are filtered; every other column is kept as is.
dataset = dataset[[c for c in dataset.columns if not c.startswith('p_') or is_std_perm(c[2:])]]
# length of filenames
dataset['_file'].apply(len).describe()
count 1401.000000 mean 33.226981 std 8.747727 min 14.000000 25% 27.000000 50% 32.000000 75% 38.000000 max 72.000000 dtype: float64
# visualize sample data
# Pick 50 random row labels (np.random.choice samples with replacement by
# default, so a row may appear more than once).  .loc replaces the removed
# DataFrame.ix indexer; the index is label-based here either way.
d = dataset.loc[np.random.choice(dataset.index, 50)]
d = d.reset_index(drop=True)
fig, ax = plt.subplots()
fig.set_size_inches(fig.get_size_inches() * (1.5, 1.6))
# Label each image row with the shortened file name.
plt.yticks(d.index, d['_file'].apply(elide), fontsize='small')
# d already holds exactly the 50 sampled rows, so no further slicing is needed.
ax.imshow(d[[c for c in d.columns if c != '_file']],
          aspect='auto', cmap=plt.cm.gray_r, interpolation='none')
<matplotlib.image.AxesImage at 0x15d58bd0>
# visualize data
# Heat map of every row of the table against every column except '_file'.
fig, ax = plt.subplots()
enlarged_size = fig.get_size_inches() * (2.5, 15)
fig.set_size_inches(enlarged_size)
# Tick every row and label it with its shortened file name.
row_labels = dataset['_file'].apply(elide)
plt.yticks(dataset.index, row_labels, fontsize='small')
value_columns = [name for name in dataset.columns if name != '_file']
ax.imshow(dataset[value_columns], aspect='auto', cmap=plt.cm.gray_r, interpolation='none')
<matplotlib.image.AxesImage at 0x92837d0>
# plot frequency of permission & features used by apps
# numeric_only=True skips the string '_file' column explicitly; newer pandas
# raises on non-numeric columns instead of silently dropping them.
plt.plot(dataset.mean(numeric_only=True))
[<matplotlib.lines.Line2D at 0x15d7ef10>]
# Per-column mean, most frequent first.  Series.order() was removed from
# pandas; sort_values() is its replacement.  numeric_only=True skips the
# string '_file' column explicitly.
dataset.mean(numeric_only=True).sort_values(ascending=False)
p_android.permission.INTERNET 0.925054 p_android.permission.ACCESS_NETWORK_STATE 0.850821 p_android.permission.WRITE_EXTERNAL_STORAGE 0.640971 p_android.permission.READ_PHONE_STATE 0.442541 p_android.permission.WAKE_LOCK 0.344754 p_android.permission.ACCESS_WIFI_STATE 0.326196 p_android.permission.ACCESS_FINE_LOCATION 0.304069 p_android.permission.ACCESS_COARSE_LOCATION 0.292648 p_android.permission.VIBRATE 0.278373 p_android.permission.GET_ACCOUNTS 0.229836 p_android.permission.RECEIVE_BOOT_COMPLETED 0.171306 p_android.permission.CAMERA 0.121342 p_com.android.vending.BILLING 0.118487 p_android.permission.GET_TASKS 0.114918 p_android.permission.READ_CONTACTS 0.108494 ... p_android.permission.SMARTCARD 0.000714 p_android.permission.ACCESS_CORSE_LOCATION 0.000714 p_android.permission.NETWORK_STATE 0.000714 p_android.permission.NETWORK 0.000714 p_android.permission.FLAG_ACTIVITY_NEW_TASK 0.000714 p_android.permission.ACCESS_CHECKIN_PROPERTIES 0.000714 p_android.permission.FLAG_SHOW_WHEN_LOCKED 0.000714 p_android.permission.ACCESS_ALL_DOWNLOADS 0.000714 f_lge.hardware.real3d.barrier.landscape 0.000714 p_android.permission.WRITE_SYNC_STATS 0.000714 f_android.permission.ACCESS_FINE_LOCATION 0.000714 f_android.permission.ACCESS_COARSE_LOCATION 0.000714 p_android.permission.SILENT 0.000714 f_android.hardware.usb.host 0.000714 p_android.permission.WRITE_SD_CARD 0.000714 Length: 214, dtype: float64
# Cluster on every column whose name does not start with '_' (this leaves out '_file').
X = dataset[[c for c in dataset.columns if not c.startswith('_')]]
clustering = AgglomerativeClustering(n_clusters=10)
clustering.fit(X)
# Work on a copy so the cluster label column does not leak into `dataset`.
datasetC = dataset.copy()
datasetC['_label'] = pd.Series(clustering.labels_, index=dataset.index)
# reset the indices
# Sorting by label makes each cluster a contiguous run of rows; the fresh
# 0..n-1 index then doubles as the row position in the heat map.
# (DataFrame.sort() was removed from pandas; sort_values() replaces it.)
datasetC.sort_values('_label', inplace=True)
datasetC.reset_index(drop=True, inplace=True)
fig, ax = plt.subplots()
fig.set_size_inches(fig.get_size_inches() * (2.5, 10))
plt.yticks(datasetC.index, datasetC['_file'].apply(elide), fontsize='small')
# visualize clusters
# Expects datasetC to be sorted by '_label' with a fresh integer index, so the
# smallest and largest index of a group delimit its band in the image.
# Iterating the groupby directly runs on Python 2 and 3 alike
# (dict.iteritems() exists only on Python 2).
for label, group in datasetC.groupby('_label'):
    start, end = group.index.min(), group.index.max()
    # separator line & text label
    ax.axhline(end + 0.5, lw=2, color='blue', alpha=0.4)
    ax.text(.4 * len(datasetC.columns), start + .5 * (end - start), '%d' % label,
            fontsize=30, fontweight='bold', va='center', color='blue', alpha=0.3)
# Draw the value matrix itself, leaving out the '_'-prefixed helper columns.
ax.imshow(datasetC[[c for c in datasetC.columns if not c.startswith('_')]],
          aspect='auto', cmap=plt.cm.gray_r, interpolation='none')
<matplotlib.image.AxesImage at 0x39a117d0>
# List the file names assigned to each cluster label.
# Single-argument print(...) calls and direct groupby iteration run on both
# Python 2 and Python 3 (print statements and dict.iteritems() do not), and
# taking the column from each group avoids the removed DataFrame.ix indexer.
for label, group in datasetC.groupby('_label'):
    print('Group %d' % label)
    print(group['_file'])
    print('')
Group 0 0 com.tonyken.cardiologybooks_2_1.apk 1 com.joelapenna.foursquared_2013120501_1.apk 2 cn.rtfsc.searchmanga_66_1.apk 3 com.jiran.weatherlocker_30_1.apk 4 com.jham.wpweather_2_1.apk 5 com.mobiquest.household_2_1.apk 6 com.jham.weatherhkrt_7_1.apk 7 com.leadapps.android.radio_15_1.apk 8 tv.pps.tpad_160_1.apk 9 tv.weikan_200_1.apk 10 com.nbaimd.gametime.nba2011_22_1.apk 11 org.traintickets.act_24_1.apk 12 sph.omy.news_12_1.apk 13 smr.mswr13.ui_2_1.apk 14 blue.sexy.girls.wallpaper.hd_3_1.apk ... 169 com.singtel.mysingtel_55_1.apk 170 com.rails_50_1.apk 171 com.farasbee.isure_2_1.apk 172 com.burningpassion.hindidictionary_6_1.apk 173 coupon.voucher.voupons_10_1.apk 174 com.symapp.christmasclock1_3_1.apk 175 com.trustmobi_19_1.apk 176 com.ttnet.muzik_8_1.apk 177 com.tunepickermp3downloader_1_1.apk 178 com.unips.livewallpaper.colorfulrose_1_1.apk 179 com.deng.yakultpotoyes5_10000_1.apk 180 com.yukka.livewallpaper.ctree2_3_1.apk 181 com.trovit.android.apps.jobs_24_1.apk 182 com.dendicapp.dentaldictionarylite_3_1.apk 183 com.ddinfoapp.diseasedrugsinformation_4_1.apk Name: _file, Length: 184, dtype: object Group 1 184 com.palmarysoft.forecaweather_106_1.apk 185 com.picsart.studio_78_1.apk 186 com.greencopper.android.wavesvienna_2_1.apk 187 com.SuperPantry.finder_1_1.apk 188 com.ada.deals_13_1.apk 189 com.parkopedia_197_1.apk 190 com.singapore.trafficreport.v1_1_1.apk 191 com.ms.parlamento_22_1.apk 192 com.skino.c2dm_19_1.apk 193 com.appgenindy.doraemonthe_1_1.apk 194 net.GTunesmp3Download_1_1.apk 195 com.avatarabridged_1_1.apk 196 com.espnf_9_1.apk 197 com.icreativemobile.dragoncitycheats_1_1.apk 198 com.secdev.sihirbaznumaralari_1_1.apk 199 cat.ereza.properbusbcn_14_1.apk 200 com.app_eof.layout_401_1.apk 201 ru.afisha.android_2330_1.apk 202 com.pilottravelcenters.mypilot_16_1.apk 203 com.national.geographic.cjurovzqtruyybdhqw_17_... 
204 com.navigaweb_55_1.apk 205 com.audioguidia.myweather_23_1.apk 206 com.google.earth_13294050_1.apk 207 com.baaddin_10_1.apk 208 com.beautifulapps.applockex_20_1.apk 209 com.app_freebet.layout_399_1.apk 210 com.peppa.pig_51_1.apk 211 com.onlinewerkz.topsingaporenews_9_1.apk 212 com.app_askustaz.layout_401_1.apk 213 com.btcc.crazy_22_1.apk 214 dibr.works.ovh_30_1.apk 215 com.dunte.nithandek_4_1.apk 216 com.mobileestate.mobileestateandroid_6_1.apk 217 com.cibc.android.mobi_6_1.apk 218 com.maherenstein.epl.news_17_1.apk 219 com.lwl.ubah_3_1.apk 220 com.yahoo.news.swfgerineoldwklasig_3_1.apk 221 com.bredir.boopsie.slcolib_128_1.apk 222 de.hafas.android.arriva_100010_1.apk 223 com.macropinch.swan_340027_1.apk 224 com.dev.geo_1_1.apk 225 com.cleartrip.android_14_1.apk 226 de.mdiener.rain_107_1.apk 227 com.secdev.ucgenvucutrehberi_1_1.apk 228 com.greencopper.android.nxne_3_1.apk 229 com.app_yoursmine.layout_303_1.apk 230 jun.jobscentral_6_1.apk 231 com.insituconcept.cnsdijonfr_12_1.apk 232 com.dory.alkahtani_1_1.apk 233 com.mobilesrepublic.qnet_332_1.apk 234 de.mdiener.rain.usa_107_1.apk 235 au.com.angryrobot.wikicamps_33_1.apk 236 com.we.chat_4_1.apk Name: _file, Length: 53, dtype: object Group 2 237 com.jb.gosms_163_1.apk 238 kr.co.tictocplus_30004_1.apk 239 com.doubleTwist.androidPlayer_20303_1.apk 240 com.jiubang.goscreenlock.plugin.side_3_1.apk 241 com.iconnect.app.pts_353_1.apk 242 com.mobiwork.devices.android_178_1.apk 243 cn.kuwo.player_5690_1.apk 244 me.papa_31100_1.apk 245 cn.anyradio.pad_86_1.apk 246 com.aacmcc.mobilevideo_4000102_1.apk 247 com.beejive.im.fbchat_99_1.apk 248 com.antivirus_186306_1.apk 249 ru.yandex.shell_42854_1.apk 250 cn.jingling.motu.photowonder_101_1.apk 251 com.oovoo_2604_1.apk 252 cn.etouch.ecalendar2_425_1.apk 253 com.gau.go.touchhelperex.theme.imusic_3_1.apk 254 com.tencent.qqpimsecure_146_1.apk 255 com.gamed9.hero.ensea_7_1.apk 256 com.gau.go.launcherex.gowidget.switchwidget_42... 
257 com.cornermation.calltaxi_16_1.apk 258 com.ijinshan.kbatterydoctor_en_4030107_1.apk 259 com.lx.launcher8_48_1.apk 260 com.sina.weibo_650_1.apk 261 com.tencent.mobileqq_82_1.apk 262 com.mixzing.basic_54_1.apk 263 com.tencent.mm_355_1.apk 264 com.hi.applock2_21_1.apk 265 com.facebook.katana_666395_1.apk 266 com.facebook.orca_663362_1.apk 267 com.topi_1311261421_1.apk 268 com.netqin.ps_48_1.apk 269 com.kakao.talk_138_1.apk 270 com.lslk.sleepbot_73_1.apk 271 com.estrongs.android.pop_124_1.apk 272 com.fring_236_1.apk Name: _file, dtype: object Group 3 273 com.outfit7.talkingtom_63_1.apk 274 com.outfit7.talkingben_43_1.apk 275 org.videolan.vlc.betav7neon_1400_1.apk 276 com.aastudio.talkshow_1_1.apk 277 com.jbl.android.spotimote_72_1.apk 278 pl.eclicto.android_146_1.apk 279 com.ivona.tts_353_1.apk 280 com.jrtstudio.music_22_1.apk 281 com.jumplife.tvdrama_51_1.apk 282 com.film.instaliker_3_1.apk 283 com.wei.ytseries_34_1.apk 284 com.ThunderVPN.ThunderVPNFree_3_1.apk 285 com.outfit7.tomslovelettersfree_41_1.apk 286 com.elift.hdplayer_163_1.apk 287 com.notabasement.mangarock.android.titan_7_1.apk ... 
402 pl.sport.live_172_1.apk 403 com.gameloft.android.ANMP.GloftDMHM_140_1.apk 404 com.gameloft.android.ANMP.GloftIAHM_2100_1.apk 405 com.gameloft.android.ANMP.GloftRAHM_10201_1.apk 406 com.gau.go.launcherex.gowidget.timer_1_1.apk 407 com.smule.magicpiano_133_1.apk 408 com.metago.astro_564_1.apk 409 com.smartwho.SmartAllCurrencyConverter_8_1.apk 410 jp.naver.lineplay.android_48_1.apk 411 org.espier.widget.imusic_7_1.apk 412 com.igg.castleclash_1200240_1.apk 413 org.coolreader_854_1.apk 414 com.sleekbit.ovuview_69_1.apk 415 com.fz.play.football.free_6_1.apk 416 com.skysoft.kkbox.android_40600_1.apk Name: _file, Length: 144, dtype: object Group 4 417 com.huahin.nuttysoft.parmantube_1_1.apk 418 com.gmail.chen.linus.miu.hd_4_1.apk 419 com.Global.sports_2_1.apk 420 com.gearnbulb.nhm_2_1.apk 421 com.gdoc.sync_1_1.apk 422 cc.co.eurdev.urecorder_5_1.apk 423 com.gymradio.radio_app_2549_1.apk 424 com.herman.ringtone_58_1.apk 425 com.gonglue.xieemanhua7_2_1.apk 426 biz.neoline.neobook_280_1.apk 427 com.hket.android.ezone_6_1.apk 428 com.hthk.ThreeShortCode_4_1.apk 429 com.hz.airpollutionhk_23_1.apk 430 book.revisiongroup.brockengag_9_1.apk 431 ch.aits.android.kdroid_22_1.apk ... 
677 com.tete.wifiadb_3_1.apk 678 com.su.emojisticker_maker_vi_fr.pay_10_1.apk 679 gextreme.app44_1_1.apk 680 gextreme.app45_1_1.apk 681 iec.framesforever.free_11120_1.apk 682 com.szyk.myheart_2306_1.apk 683 it.animevoice_4_1.apk 684 fr.ismin.magnetpd_6_1.apk 685 com.supo.pocket.mangareader_6_1.apk 686 es.shimio.android.sagalives_2013110600_1.apk 687 com.sigusigu.recordone_5_1.apk 688 KaimonoHikakuMemo.littleap_203_1.apk 689 com.the7art.skatingrinkwallpaper_1014_1.apk 690 com.siyami.apps.ia_270_1.apk 691 com.tfd.mobile.TfdSearch_33_1.apk Name: _file, Length: 275, dtype: object Group 5 692 com.getunik.aha.pollen_8_1.apk 693 com.griffin.client.allindia_5_1.apk 694 com.finestandroid.autostopwatch_9_1.apk 695 com.glassdoor.app_21_1.apk 696 com.newspaperfrontpage.usa_1_1.apk 697 com.gjmatw.android.app_171_1.apk 698 com.gn.sgflight_7_1.apk 699 com.a1697157791516e6ca1985da3a.a24985223a_1_1.apk 700 com.freelancewatcher_18_1.apk 701 ru.mobimoney.tele2mastercard_3_1.apk 702 at.fhooe.mc.android_3_1.apk 703 com.foxsports.android_14_1.apk 704 com.snowballingsoftware.snowballingsavings_2_1... 705 streetdirectory.mobile_22_1.apk 706 com.bit.seagames_2_1.apk ... 834 com.websoo.taxi.meter.hk_27_1.apk 835 com.myappatory.busbuddy_9_1.apk 836 jp.gr.java_conf.ketupablakistoni.android.joggi... 
837 jay.jaewon.USAJobFinder_2_1.apk 838 com.chope.gui_10_1.apk 839 sg.com.coldstorage_1_1.apk 840 mb.cmm.afgimmo_5_1.apk 841 com.barchart.app_3_1.apk 842 com.calmary.maarygil_100100_1.apk 843 com.zoomash.weatherfree_4_1.apk 844 com.cabsg_91111_1.apk 845 com.mfluid.LocationMangerGmap_7_1.apk 846 com.zzangpenguin3_1_1.apk 847 com.idmobile.rumeteo_3_1.apk 848 fr.tvbarthel.apps.simplethermometer_4_1.apk Name: _file, Length: 157, dtype: object Group 6 849 com.kth.PuddingCamera_40_1.apk 850 com.keniu.pai_8_1.apk 851 com.trailblazing.singapore_1_1.apk 852 com.grabtaxi.driver_101_1.apk 853 me.scan.android.client_19_1.apk 854 com.cyworld.camera_47_1.apk 855 com.flash.light.blink.on.call.alert.sms_5_1.apk 856 com.rikkeisoft.diary_8_1.apk 857 com.discovercircle10_68_1.apk 858 com.surpax.ledflashlight.panel_3_1.apk 859 com.web.webmmrelax_56_1.apk 860 ru.tcsbank.wallet_101_1.apk 861 ru.yandex.market_211_1.apk 862 me.moment.momentme_36_1.apk 863 com.myfitnesspal.android_3862_1.apk ... 900 my.beautyCamera_50_1.apk 901 com.google.android.youtube_5324_1.apk 902 com.buuuk.ntuc_23_1.apk 903 cn.opda.android.softwarelock_210_1.apk 904 aw.awesomewidgets.cutekittycat_5_1.apk 905 com.baby.photocamera_5_1.apk 906 tipsto.lose.weight_5_1.apk 907 thecouponsapp.coupon_942_1.apk 908 com.sergiobarbara.nightvision.camera_180_1.apk 909 com.geoactio.tus_2_1.apk 910 ch.sunstore.sunstore_11_1.apk 911 com.gau.go.launcherex.gowidget.gobarcodescanne... 912 com.sohu.inputmethod.sogou_220_1.apk 913 yeshua.idphotocreator.demo_1_1.apk 914 com.snapchat.android_166_1.apk Name: _file, Length: 66, dtype: object Group 7 915 com.osbelsoftware.GenderPredictor_2_1.apk 916 com.eterno_59_1.apk 917 com.twopick.discount_tour_1_1.apk 918 com.jb.gokeyboard.langpack.zh_hk_5_1.apk 919 com.jb.gosms.pctheme.rosesms_1_1.apk 920 com.iggy.mobile.metrodeal_2_1.apk 921 com.loveculture_1_1.apk 922 com.helectronsoft.livewallpaper.christmas.hd.f... 
923 com.blogspot.droindman.golden_3_1.apk 924 com.zl.dictionary.jelly.pt_br_4_1.apk 925 com.gau.go.weatherex.systemwidgetskin.defaults... 926 com.loudcrow.marvelavengers_3_1.apk 927 com.teslacoilsw.notifier_500_1.apk 928 com.keb.android.msbreal_24_1.apk 929 com.twistandroid.snowall_2_1.apk ... 1036 com.ctrlb.talkinterval_5_1.apk 1037 uk.co.senab.actionbarpulltorefresh.samples.sto... 1038 com.hi.applock.theme.gold_4_1.apk 1039 rondroid.bluemine_21_1.apk 1040 info.afrand.android.almahdi.azva_1_1.apk 1041 com.alpha.learnalpha_1_1.apk 1042 com.bbbz.nexus5lwp_14_1.apk 1043 air.trispurmv.ChineseSongs80s_1000000_1.apk 1044 pkeyboard.languagepack.uk_2_1.apk 1045 fr.lesparticuliers.mobile_5_1.apk 1046 org.topixoft.market_test.no_limitations_1_1.apk 1047 jp.dip.gpsoft.bottomkeeper_5_1.apk 1048 ZXStyles.ZXReader_224_1.apk 1049 com.oslowski.SiedlceMeteo2012_31_1.apk 1050 mobi.dotit.dotkitchen_10_1.apk Name: _file, Length: 136, dtype: object Group 8 1051 com.my.mail_4112_1.apk 1052 com.yahoo.mobile.client.android.im_15957_1.apk 1053 com.kfactormedia.mycalendarmobile_86_1.apk 1054 com.anydo.cal_29_1.apk 1055 com.bbm_83_1.apk 1056 com.linkedin.android_72_1.apk 1057 com.twitter.android_3000473_1.apk 1058 de.motain.iliga_40030011_1.apk 1059 com.google.android.music_1301_1.apk 1060 com.yahoo.mobile.client.android.mail_1311661_1... 
1061 com.companionlink.clusbsync_518_1.apk 1062 com.wsl.noom_146_1.apk 1063 com.google.android.maps.mytracks_70_1.apk 1064 com.hipmunk.android_76_1.apk 1065 com.soundcloud.android_74_1.apk 1066 com.expedia.bookings_32_1.apk 1067 com.abc.abcnews_200040_1.apk 1068 com.evernote_1055303_1.apk 1069 com.android.chrome_1650059_1.apk 1070 com.shazam.android_80349_1.apk 1071 org.mozilla.firefox_2013111215_1.apk 1072 com.androidpcsynccom.androidpcsync_14_1.apk 1073 com.taskslendar_8_1.apk 1074 com.google.android.apps.docs_1246114_1.apk 1075 com.endomondo.android_115_1.apk 1076 com.google.android.apps.currents_131221719_1.apk Name: _file, dtype: object Group 9 1077 com.venture.labs.car.alarm.hacker.prank_3_1.apk 1078 com.astron.BmiCalculator_1_1.apk 1079 com.wYollar_1384150083_1.apk 1080 com.atomic.apps.medical.drug.dictionary_1_1.apk 1081 com.doeiqts.PTCGOCodeScanner_5_1.apk 1082 com.ATMB_159_1.apk 1083 com.andromo.dev86.app119_219_1.apk 1084 com.SunnyBox.Jabong2013_2_1.apk 1085 nl.picaktech.android.turkustan_1_1.apk 1086 com.flashmark.momancameraguide_1_1.apk 1087 org.androidworks.livewallpaperrosefree_40_1.apk 1088 com.amax.easterngardenlivewallpaper_1_1.apk 1089 com.xper.spotify.top_3_1.apk 1090 com.altibbi.ramdan_3_1.apk 1091 com.fastermobile_3_1.apk ... 1202 tr.com.innova.fta.mhrs_6_1.apk 1203 com.jerry.weather_6_1.apk 1204 com.jham.wpweather_1_1.apk 1205 tv.terry.ThaiTvOnline_1_1.apk 1206 com.netfortuna.iwanj.jobsearch_4_1.apk 1207 com.wSocialPinExchanger_1385581828_1.apk 1208 com.roarapps.singaporemrt_2_1.apk 1209 com.roidgame.MaybeBaby_4_1.apk 1210 com.naritasoft.yearview_4_1.apk 1211 com.lapema.prensasingapore_34_1.apk 1212 com.bmmGreetings_5_1.apk 1213 nooniHome.jaiGanpatiJi_2_1.apk 1214 com.sigara.project_6_1.apk 1215 com.roar.liveTraffic.liveTrafficHKFree_3_1.apk 1216 com.max.LeadershipQuotes_10_1.apk Name: _file, Length: 140, dtype: object