"""HR article classification: a rule-based schema without ML, plus a scikit-learn sketch.

The rule-based part scores a text by looking for three kinds of signals
(HR keywords, job titles, company names) and labels it as an HR article
when the score reaches ``SOME_THRESHOLD``.
"""

company_names = ['Google', 'Microsoft', 'Apple']
job_titles = ['CFO', 'CEO', 'CFO']
text = 'Microsoft recently hired such and such person AS CFO'
SOME_THRESHOLD = 20

# Substrings that typically show up in HR articles (matched case-sensitively).
HR_KEYWORDS = ('hire', 'hiring', 'join', 'joining', 'laying off', 'resign')
# Points added to the confidence for every matched signal.
SIGNAL_WEIGHT = 10


def hr_confidence(article, titles=None, companies=None):
    """Return the rule-based HR confidence score for *article*.

    Args:
        article: Text to score.
        titles: Job titles to look for; defaults to the module-level
            ``job_titles``.
        companies: Company names to look for; defaults to the module-level
            ``company_names``.

    Returns:
        An int: ``SIGNAL_WEIGHT`` if any HR keyword occurs, plus
        ``SIGNAL_WEIGHT`` for every distinct job title and every distinct
        company name found in the text.
    """
    titles = job_titles if titles is None else titles
    companies = company_names if companies is None else companies

    score = 0
    # Each keyword needs its own `in` test: a bare string such as 'join'
    # inside an `or` chain is always truthy and would match every text.
    if any(keyword in article for keyword in HR_KEYWORDS):
        score += SIGNAL_WEIGHT
    # HR articles generally state the position of the new hire. Iterate over
    # distinct entries so a title listed twice is not counted twice.
    score += SIGNAL_WEIGHT * sum(title in article for title in set(titles))
    # A company name is a good sign the article is in the business domain.
    score += SIGNAL_WEIGHT * sum(name in article for name in set(companies))
    return score


def hr_message(article, is_hr):
    """Return the human-readable verdict line for *article*."""
    if is_hr:
        return '-- {} -- is an HR article'.format(article)
    return '-- {} -- is not an HR article'.format(article)


confidence = hr_confidence(text)
label = 1 if confidence >= SOME_THRESHOLD else 0  # 1 means "HR article"
print(hr_message(text, label == 1))


def classify_with_model(article, vectorizer, feat_selector, clf):
    """Classify *article* with a trained scikit-learn style pipeline.

    Args:
        article: Text to classify.
        vectorizer: Fitted text vectorizer exposing ``transform``.
        feat_selector: Fitted feature selector exposing ``transform``.
        clf: Classifier exposing ``predict``, already trained on labeled data.

    Returns:
        The predicted class for the single article.
    """
    # Convert into a vector. The vectorizer expects an iterable of documents,
    # so the single article is wrapped in a list.
    # NOTE(review): assumes transform() returns a sparse matrix with
    # .toarray(), as scikit-learn text vectorizers do -- confirm.
    count = vectorizer.transform([article]).toarray()
    # Do feature selection.
    selected_feats = feat_selector.transform(count)
    # Run the classifier; predict() returns one entry per input document.
    pred_class = clf.predict(selected_feats)[0]
    # Report the model's prediction, not the rule-based `label`.
    # NOTE(review): assumes class 1 means "HR article", mirroring `label`.
    print(hr_message(article, pred_class == 1))
    return pred_class


# Given a trained classifier, vectorizer and feature selection method, this is
# how one may classify an article:
#     classify_with_model(text, vectorizer, feat_selector, clf)